FZH1996 committed on
Commit
e7d695a
1 Parent(s): fe45bc3

update fed-lora

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. examples/NLG/eval/GenerationEval/bleurt +1 -0
  2. examples/NLG/eval/GenerationEval/metrics/bleurt +1 -0
  3. examples/NLG/eval/e2e/metrics/__pycache__/__init__.cpython-37.pyc +0 -0
  4. examples/NLG/eval/e2e/metrics/__pycache__/pymteval.cpython-37.pyc +0 -0
  5. examples/NLG/eval/e2e/pycocoevalcap/__pycache__/__init__.cpython-37.pyc +0 -0
  6. examples/NLG/eval/e2e/pycocoevalcap/__pycache__/eval.cpython-37.pyc +0 -0
  7. examples/NLG/eval/e2e/pycocoevalcap/bleu/__pycache__/__init__.cpython-37.pyc +0 -0
  8. examples/NLG/eval/e2e/pycocoevalcap/bleu/__pycache__/bleu.cpython-37.pyc +0 -0
  9. examples/NLG/eval/e2e/pycocoevalcap/bleu/__pycache__/bleu_scorer.cpython-37.pyc +0 -0
  10. examples/NLG/eval/e2e/pycocoevalcap/cider/__pycache__/__init__.cpython-37.pyc +0 -0
  11. examples/NLG/eval/e2e/pycocoevalcap/cider/__pycache__/cider.cpython-37.pyc +0 -0
  12. examples/NLG/eval/e2e/pycocoevalcap/cider/__pycache__/cider_scorer.cpython-37.pyc +0 -0
  13. examples/NLG/eval/e2e/pycocoevalcap/meteor/__pycache__/__init__.cpython-37.pyc +0 -0
  14. examples/NLG/eval/e2e/pycocoevalcap/meteor/__pycache__/meteor.cpython-37.pyc +0 -0
  15. examples/NLG/eval/e2e/pycocoevalcap/rouge/__pycache__/__init__.cpython-37.pyc +0 -0
  16. examples/NLG/eval/e2e/pycocoevalcap/rouge/__pycache__/rouge.cpython-37.pyc +0 -0
  17. examples/NLG/eval/e2e/pycocoevalcap/tokenizer/__pycache__/__init__.cpython-37.pyc +0 -0
  18. examples/NLG/eval/e2e/pycocoevalcap/tokenizer/__pycache__/ptbtokenizer.cpython-37.pyc +0 -0
  19. examples/NLG/eval/e2e/pycocotools/__pycache__/__init__.cpython-36.pyc +0 -0
  20. examples/NLG/eval/e2e/pycocotools/__pycache__/__init__.cpython-37.pyc +0 -0
  21. examples/NLG/eval/e2e/pycocotools/__pycache__/coco.cpython-36.pyc +0 -0
  22. examples/NLG/eval/e2e/pycocotools/__pycache__/coco.cpython-37.pyc +0 -0
  23. examples/NLG/src/.DS_Store +0 -0
  24. examples/NLG/src/__pycache__/data_utils.cpython-310.pyc +0 -0
  25. examples/NLG/src/__pycache__/data_utils.cpython-36.pyc +0 -0
  26. examples/NLG/src/__pycache__/data_utils.cpython-37.pyc +0 -0
  27. examples/NLG/src/__pycache__/encoder.cpython-37.pyc +0 -0
  28. examples/NLG/src/__pycache__/exp_utils.cpython-310.pyc +0 -0
  29. examples/NLG/src/__pycache__/exp_utils.cpython-37.pyc +0 -0
  30. examples/NLG/src/__pycache__/gpu.cpython-310.pyc +0 -0
  31. examples/NLG/src/__pycache__/gpu.cpython-36.pyc +0 -0
  32. examples/NLG/src/__pycache__/gpu.cpython-37.pyc +0 -0
  33. examples/NLG/src/__pycache__/model.cpython-310.pyc +0 -0
  34. examples/NLG/src/__pycache__/model.cpython-36.pyc +0 -0
  35. examples/NLG/src/__pycache__/model.cpython-37.pyc +0 -0
  36. examples/NLG/src/__pycache__/optimizer.cpython-36.pyc +0 -0
  37. examples/NLG/src/__pycache__/optimizer.cpython-37.pyc +0 -0
  38. examples/NLG/src/data_utils.py +282 -0
  39. examples/NLG/src/encoder.py +132 -0
  40. examples/NLG/src/exp_utils.py +46 -0
  41. examples/NLG/src/format_converting_dart.py +43 -0
  42. examples/NLG/src/format_converting_e2e.py +20 -0
  43. examples/NLG/src/format_converting_webnlg.py +68 -0
  44. examples/NLG/src/gpt2_beam.py +419 -0
  45. examples/NLG/src/gpt2_decode.py +187 -0
  46. examples/NLG/src/gpt2_encode.py +70 -0
  47. examples/NLG/src/gpt2_ft.py +385 -0
  48. examples/NLG/src/gpu.py +129 -0
  49. examples/NLG/src/model.log +698 -0
  50. examples/NLG/src/model.py +460 -0
examples/NLG/eval/GenerationEval/bleurt ADDED
@@ -0,0 +1 @@
1
+ Subproject commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
examples/NLG/eval/GenerationEval/metrics/bleurt ADDED
@@ -0,0 +1 @@
1
+ Subproject commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
examples/NLG/eval/e2e/metrics/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (169 Bytes). View file
 
examples/NLG/eval/e2e/metrics/__pycache__/pymteval.cpython-37.pyc ADDED
Binary file (12.9 kB). View file
 
examples/NLG/eval/e2e/pycocoevalcap/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (195 Bytes). View file
 
examples/NLG/eval/e2e/pycocoevalcap/__pycache__/eval.cpython-37.pyc ADDED
Binary file (2.57 kB). View file
 
examples/NLG/eval/e2e/pycocoevalcap/bleu/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (200 Bytes). View file
 
examples/NLG/eval/e2e/pycocoevalcap/bleu/__pycache__/bleu.cpython-37.pyc ADDED
Binary file (1.24 kB). View file
 
examples/NLG/eval/e2e/pycocoevalcap/bleu/__pycache__/bleu_scorer.cpython-37.pyc ADDED
Binary file (8.07 kB). View file
 
examples/NLG/eval/e2e/pycocoevalcap/cider/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (201 Bytes). View file
 
examples/NLG/eval/e2e/pycocoevalcap/cider/__pycache__/cider.cpython-37.pyc ADDED
Binary file (1.67 kB). View file
 
examples/NLG/eval/e2e/pycocoevalcap/cider/__pycache__/cider_scorer.cpython-37.pyc ADDED
Binary file (7.85 kB). View file
 
examples/NLG/eval/e2e/pycocoevalcap/meteor/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (202 Bytes). View file
 
examples/NLG/eval/e2e/pycocoevalcap/meteor/__pycache__/meteor.cpython-37.pyc ADDED
Binary file (2.75 kB). View file
 
examples/NLG/eval/e2e/pycocoevalcap/rouge/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (203 Bytes). View file
 
examples/NLG/eval/e2e/pycocoevalcap/rouge/__pycache__/rouge.cpython-37.pyc ADDED
Binary file (3.75 kB). View file
 
examples/NLG/eval/e2e/pycocoevalcap/tokenizer/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (205 Bytes). View file
 
examples/NLG/eval/e2e/pycocoevalcap/tokenizer/__pycache__/ptbtokenizer.cpython-37.pyc ADDED
Binary file (2.18 kB). View file
 
examples/NLG/eval/e2e/pycocotools/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (189 Bytes). View file
 
examples/NLG/eval/e2e/pycocotools/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (193 Bytes). View file
 
examples/NLG/eval/e2e/pycocotools/__pycache__/coco.cpython-36.pyc ADDED
Binary file (13.4 kB). View file
 
examples/NLG/eval/e2e/pycocotools/__pycache__/coco.cpython-37.pyc ADDED
Binary file (13.4 kB). View file
 
examples/NLG/src/.DS_Store ADDED
Binary file (6.15 kB). View file
 
examples/NLG/src/__pycache__/data_utils.cpython-310.pyc ADDED
Binary file (8.49 kB). View file
 
examples/NLG/src/__pycache__/data_utils.cpython-36.pyc ADDED
Binary file (8.58 kB). View file
 
examples/NLG/src/__pycache__/data_utils.cpython-37.pyc ADDED
Binary file (8.58 kB). View file
 
examples/NLG/src/__pycache__/encoder.cpython-37.pyc ADDED
Binary file (5.1 kB). View file
 
examples/NLG/src/__pycache__/exp_utils.cpython-310.pyc ADDED
Binary file (1.49 kB). View file
 
examples/NLG/src/__pycache__/exp_utils.cpython-37.pyc ADDED
Binary file (1.44 kB). View file
 
examples/NLG/src/__pycache__/gpu.cpython-310.pyc ADDED
Binary file (3.58 kB). View file
 
examples/NLG/src/__pycache__/gpu.cpython-36.pyc ADDED
Binary file (3.53 kB). View file
 
examples/NLG/src/__pycache__/gpu.cpython-37.pyc ADDED
Binary file (3.54 kB). View file
 
examples/NLG/src/__pycache__/model.cpython-310.pyc ADDED
Binary file (13.3 kB). View file
 
examples/NLG/src/__pycache__/model.cpython-36.pyc ADDED
Binary file (13.7 kB). View file
 
examples/NLG/src/__pycache__/model.cpython-37.pyc ADDED
Binary file (13.5 kB). View file
 
examples/NLG/src/__pycache__/optimizer.cpython-36.pyc ADDED
Binary file (11.4 kB). View file
 
examples/NLG/src/__pycache__/optimizer.cpython-37.pyc ADDED
Binary file (11.4 kB). View file
 
examples/NLG/src/data_utils.py ADDED
@@ -0,0 +1,282 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import os, sys
6
+ import glob
7
+ import random
8
+ from collections import Counter, OrderedDict
9
+ import numpy as np
10
+ import torch
11
+ import json
12
+
13
+ import torch
14
+ from torch.utils.data import Dataset
15
+ from torch.utils.data import DataLoader
16
+
17
+
18
+ class LMOrderedIterator(object):
19
+ def __init__(self, data, bsz, bptt, eval_len=None, device='cpu', world_size=1, rank=0):
20
+ """
21
+ data -- LongTensor -- the LongTensor is strictly ordered
22
+ """
23
+ self.data = data
24
+ self.bsz = bsz
25
+ self.world_size = world_size
26
+ self.rank = rank
27
+ self.bptt = bptt # tgt_len
28
+ # existing len.
29
+ self.eval_len = bptt if eval_len is None else eval_len
30
+
31
+ self.device = device
32
+
33
+ self.global_bsz = bsz * world_size
34
+ # Work out how cleanly we can divide the dataset into bsz parts.
35
+ self.n_step = len(data) // self.global_bsz # bsz
36
+
37
+ self.split_data = torch.tensor(
38
+ data[rank * self.n_step * bsz : (rank + 1) * self.n_step * bsz],
39
+ dtype=torch.long, device=self.device
40
+ ) # data.view(-1)
41
+
42
+ self.split_data = self.split_data.view(bsz, -1)
43
+
44
+ def __iter__(self):
45
+ return self.get_fixlen_iter()
46
+
47
+ def get_batch(self, i, bptt, eval_len):
48
+ beg_idx = i
49
+ end_idx = i + bptt # seq_len
50
+
51
+ # batch_size, length
52
+ _input = self.split_data[:, beg_idx : end_idx].contiguous()
53
+ _target = self.split_data[:, beg_idx+1 : end_idx+1].contiguous()
54
+
55
+ _msk = torch.cat(
56
+ [
57
+ torch.zeros(bptt-eval_len, dtype=torch.float, device=self.device),
58
+ torch.ones(eval_len, dtype=torch.float, device=self.device)
59
+ ]
60
+ )
61
+ _msk = _msk.unsqueeze(0).expand_as(_input) # .unsqueeze(-1) # length, 1;
62
+ return _input, _target, _msk
63
+
64
+ def get_fixlen_iter(self, start=0):
65
+ self.data_len = self.split_data.size(1)
66
+ _eval_cursor = 0
67
+ for i in range(start, self.data_len - 1, self.eval_len):
68
+ bptt = min(self.bptt, self.data_len - i - 1)
69
+ _end_idx = i + bptt
70
+ yield self.get_batch(i, bptt, _end_idx - _eval_cursor)
71
+ _eval_cursor = _end_idx
72
+
73
+
74
+ class Corpus(object):
75
+ def __init__(self, path):
76
+ self.path = path
77
+ self.num_words = 0
78
+ self.tokens = []
79
+ with open(self.path, "r") as reader:
80
+ for line in reader:
81
+ items = json.loads(line.strip())
82
+ book = items['book']
83
+ tokens = items['tokens']
84
+ num_words = items['num_words']
85
+
86
+ self.num_words += num_words
87
+ self.tokens.extend(tokens)
88
+
89
+
90
+ class BinLMOrderedIterator(object):
91
+ def __init__(self, corpus, bsz, bptt, eval_len=None, device='cpu', world_size=1, rank=0):
92
+ """
93
+ data -- LongTensor -- the LongTensor is strictly ordered
94
+ """
95
+ self.corpus = corpus
96
+ self.bsz = bsz
97
+ self.world_size = world_size
98
+ self.rank = rank
99
+ self.bptt = bptt # tgt_len
100
+ # existing len.
101
+ self.eval_len = bptt if eval_len is None else eval_len
102
+ self.device = device
103
+ self.global_bsz = bsz * world_size
104
+ # Work out how cleanly we can divide the dataset into bsz parts.
105
+ self.n_step = corpus.length // self.global_bsz # bsz
106
+
107
+ self.offset = [(rank * bsz + _b) * self.n_step for _b in range(bsz)]
108
+
109
+ def __iter__(self):
110
+ return self.get_fixlen_iter()
111
+
112
+ def get_batch(self, i, bptt, eval_len):
113
+ # batch_size, length
114
+ _inputs = []
115
+ _targets = []
116
+ for _b in range(0, self.bsz):
117
+ _input = self.corpus.get_tokens(self.offset[_b] + i, bptt)
118
+ _target = self.corpus.get_tokens(self.offset[_b] + i + 1, bptt)
119
+
120
+ _inputs.append(_input)
121
+ _targets.append(_target)
122
+
123
+ _input = torch.tensor(_inputs, dtype=torch.int64, device=self.device).contiguous()
124
+ _target = torch.tensor(_targets, dtype=torch.int64, device=self.device).contiguous()
125
+
126
+ _msk = torch.cat(
127
+ [
128
+ torch.zeros(bptt-eval_len, dtype=torch.float, device=self.device),
129
+ torch.ones(eval_len, dtype=torch.float, device=self.device)
130
+ ]
131
+ )
132
+ _msk = _msk.unsqueeze(0).expand_as(_input) # .unsqueeze(-1) # length, 1;
133
+ return _input, _target, _msk
134
+
135
+ def get_fixlen_iter(self, start=0):
136
+ #self.data_len = self.split_data.size(1)
137
+ _eval_cursor = 0
138
+ for i in range(start, self.n_step - 1, self.eval_len):
139
+ bptt = min(self.bptt, self.n_step - i - 1)
140
+ _end_idx = i + bptt
141
+ yield self.get_batch(i, bptt, _end_idx - _eval_cursor)
142
+ _eval_cursor = _end_idx
143
+
144
+
145
+ class BinCorpus(object):
146
+ def __init__(self, path):
147
+ self.path = path
148
+
149
+ self.book_token_span = []
150
+ self.book_token_span.append(0)
151
+ tokens_sum = 0
152
+ self.num_words = 0
153
+
154
+ with open(path+'.info', 'r') as info_reader:
155
+ for line in info_reader:
156
+ items = json.loads(line.strip())
157
+ book = items['book']
158
+ num_tokens = items['num_subtokens']
159
+ num_words = items['num_words']
160
+
161
+ tokens_sum += num_tokens
162
+ self.book_token_span.append(tokens_sum)
163
+ self.num_words += num_words
164
+
165
+ self.length = self.book_token_span[-1]
166
+ self.bin_reader = open(path+'.bin', 'rb')
167
+
168
+ def get_tokens(self, offset, count):
169
+ INT64_SIZE = 8
170
+ self.bin_reader.seek(offset * INT64_SIZE)
171
+ x = np.fromfile(self.bin_reader, count=count, dtype=np.int64)  # np.int alias is removed in recent NumPy; tokens are stored as 8-byte ints
172
+ return x
173
+
174
+
175
+ def get_lm_corpus(data):
176
+ print('Producing dataset {}...'.format(data))
177
+ corpus = Corpus(data)
178
+ return corpus
179
+
180
+
181
+ def padding_tokens(tokens, max_seq_length, pad_token, direct, max_context_length=0):
182
+
183
+ if max_context_length == 0:
184
+ max_context_length = max_seq_length
185
+
186
+ if len(tokens) > max_context_length:
187
+ if direct > 0:
188
+ pad_tokens = tokens[:max_context_length]
189
+ else:
190
+ pad_tokens = tokens[-max_context_length:]
191
+ else:
192
+ pad_tokens = tokens
193
+ token_len = len(pad_tokens)
194
+ pad_tokens = pad_tokens + [pad_token for _ in range(max_seq_length - token_len)]
195
+ return pad_tokens, token_len
196
+
197
+
198
+ class FT_Dataset(Dataset):
199
+ def __init__(self, ft_file, batch_size, max_seq_length,
200
+ max_eval_length=0, joint_lm=False, prefix_len=0, infix_len=0,
201
+ prefix_cursor=1000000, infix_cursor=2000000):
202
+ self.ft_file = ft_file
203
+ self.ft_samples = self.read_ft_file(ft_file)
204
+ self.batch_size = batch_size
205
+ self.num_examples = len(self.ft_samples)
206
+ self.max_seq_length = max_seq_length
207
+ self.max_eval_length = max_eval_length
208
+ self.rng = random.Random(911)
209
+ self.joint_lm = joint_lm
210
+
211
+ self.num_batches = int((self.num_examples + self.batch_size - 1) / self.batch_size)
212
+
213
+ self.prefix_len = prefix_len
214
+ self.infix_len = infix_len
215
+ self.prefix_cursor = prefix_cursor
216
+ self.infix_cursor = infix_cursor
217
+
218
+ def __len__(self):
219
+ return self.num_batches * self.batch_size
220
+
221
+ def __getitem__(self, item):
222
+ if(item >= self.num_examples):
223
+ item = self.rng.randint(0, self.num_examples - 1)
224
+
225
+ example = self.ft_samples[item]
226
+ context = example[0]
227
+ completion = example[1]
228
+
229
+ pretokens = [i + self.prefix_cursor for i in range(0, self.prefix_len)]
230
+ intokens = [i + self.infix_cursor for i in range(0, self.infix_len)]
231
+
232
+ conditions = pretokens + context + intokens
233
+ _input, _input_len = padding_tokens(conditions + completion, self.max_seq_length, 0, 1)
234
+
235
+ pad_targets = [0 for i in range(0, self.prefix_len)] + context + [0 for i in range(0, self.infix_len)] + completion
236
+ _target, _ = padding_tokens(pad_targets[1:], self.max_seq_length, 0, 1)
237
+
238
+ if not self.joint_lm:
239
+ _msk = [0.0] * (len(conditions) - 1) + [1.0] * (_input_len - len(conditions))
240
+ else:
241
+ _msk = [1.0] * (_input_len - 1)
242
+
243
+ _msk, _ = padding_tokens(_msk, self.max_seq_length, 0.0, 1)
244
+
245
+ output = {}
246
+ output["id"] = torch.tensor(item, dtype=torch.long)
247
+
248
+ _query, _query_len = padding_tokens(
249
+ conditions, self.max_seq_length, 0, -1,
250
+ max_context_length = self.max_seq_length - self.max_eval_length
251
+ )
252
+ output["query"] = torch.tensor(_query, dtype=torch.long)
253
+ output["query_len"] = torch.tensor(_query_len, dtype=torch.long)
254
+
255
+ output["input"] = torch.tensor(_input, dtype=torch.long)
256
+ output["target"] = torch.tensor(_target, dtype=torch.long)
257
+
258
+ output["mask"] = torch.tensor(_msk, dtype=torch.float)
259
+ return output
260
+
261
+ def read_ft_file(self, ft_file):
262
+ ft_samples = []
263
+ with open(ft_file, 'r') as reader:
264
+ for line in reader:
265
+ items = json.loads(line.strip())
266
+ context = items['context']
267
+ completion = items['completion']
268
+ ft_samples.append([context, completion])
269
+ return ft_samples
270
+
271
+ def get_item_list(self, start, interval):
272
+ start = min(start, self.num_examples-1)
273
+ start = max(0,start)
274
+ if(start + interval >= self.num_examples):
275
+ end = self.num_examples
276
+ else:
277
+ end = start + interval
278
+ samples = []
279
+ for index in range(start, end):
280
+ output = self.__getitem__(index)
281
+ samples.append(output)
282
+ return samples
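
A minimal usage sketch for the FT_Dataset class added above (illustrative only, not part of this commit; the data path and sizes are placeholders). It reads a JSONL file of already-BPE-encoded context/completion id lists and yields padded tensors plus a float mask over the completion tokens that is used to weight the LM loss:

    from torch.utils.data import DataLoader
    from data_utils import FT_Dataset

    train_data = FT_Dataset('./data/e2e/train.jsonl', batch_size=8, max_seq_length=512)  # placeholder path
    train_loader = DataLoader(train_data, batch_size=8, shuffle=False)
    for batch in train_loader:
        # 'input'/'target' are shifted token ids; 'mask' marks the completion span
        print(batch['input'].shape, batch['target'].shape, batch['mask'].shape)
        break
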
examples/NLG/src/encoder.py ADDED
@@ -0,0 +1,132 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import os
6
+ import json
7
+ import regex as re
8
+ from functools import lru_cache
9
+
10
+
11
+ @lru_cache()
12
+ def bytes_to_unicode():
13
+ """
14
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
15
+ The reversible bpe codes work on unicode strings.
16
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
17
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
18
+ This is a significant percentage of your normal, say, 32K bpe vocab.
19
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
20
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
21
+ """
22
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
23
+ cs = bs[:]
24
+ n = 0
25
+ for b in range(2**8):
26
+ if b not in bs:
27
+ bs.append(b)
28
+ cs.append(2**8+n)
29
+ n += 1
30
+ cs = [chr(n) for n in cs]
31
+ return dict(zip(bs, cs))
32
+
33
+
34
+ def get_pairs(word):
35
+ """Return set of symbol pairs in a word.
36
+ Word is represented as tuple of symbols (symbols being variable-length strings).
37
+ """
38
+ pairs = set()
39
+ prev_char = word[0]
40
+ for char in word[1:]:
41
+ pairs.add((prev_char, char))
42
+ prev_char = char
43
+ return pairs
44
+
45
+
46
+ class Encoder:
47
+
48
+ def __init__(self, encoder, bpe_merges, errors='replace'):
49
+ self.encoder = encoder
50
+ self.decoder = {v:k for k,v in self.encoder.items()}
51
+ self.errors = errors # how to handle errors in decoding
52
+ self.byte_encoder = bytes_to_unicode()
53
+ self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
54
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
55
+ self.cache = {}
56
+ # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
57
+ try:
58
+ import regex as re
59
+ self.re = re
60
+ except ImportError:
61
+ raise ImportError('Please install regex with: pip install regex')
62
+
63
+
64
+ self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
65
+
66
+ def bpe(self, token):
67
+ if token in self.cache:
68
+ return self.cache[token]
69
+ word = tuple(token)
70
+ pairs = get_pairs(word)
71
+
72
+ if not pairs:
73
+ return token
74
+
75
+ while True:
76
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
77
+ if bigram not in self.bpe_ranks:
78
+ break
79
+ first, second = bigram
80
+ new_word = []
81
+ i = 0
82
+ while i < len(word):
83
+ try:
84
+ j = word.index(first, i)
85
+ new_word.extend(word[i:j])
86
+ i = j
87
+ except:
88
+ new_word.extend(word[i:])
89
+ break
90
+
91
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
92
+ new_word.append(first+second)
93
+ i += 2
94
+ else:
95
+ new_word.append(word[i])
96
+ i += 1
97
+ new_word = tuple(new_word)
98
+ word = new_word
99
+ if len(word) == 1:
100
+ break
101
+ else:
102
+ pairs = get_pairs(word)
103
+ word = ' '.join(word)
104
+ self.cache[token] = word
105
+ return word
106
+
107
+ def encode(self, text):
108
+ bpe_tokens = []
109
+ tokens = []
110
+ for token in re.findall(self.pat, text):
111
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
112
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
113
+ if token:
114
+ tokens.append(token)
115
+ return bpe_tokens, tokens
116
+
117
+ def decode(self, tokens):
118
+ text = ''.join([self.decoder[token] for token in tokens])
119
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
120
+ return text
121
+
122
+
123
+ def get_encoder(models_dir):
124
+ with open(os.path.join(models_dir, 'encoder.json'), 'r') as f:
125
+ encoder = json.load(f)
126
+ with open(os.path.join(models_dir, 'vocab.bpe'), 'r', encoding="utf-8") as f:
127
+ bpe_data = f.read()
128
+ bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
129
+ return Encoder(
130
+ encoder=encoder,
131
+ bpe_merges=bpe_merges,
132
+ )
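
A short round-trip sketch for the byte-level BPE Encoder defined above (illustrative only; 'vocab_dir' stands for any directory containing encoder.json and vocab.bpe):

    from encoder import get_encoder

    enc = get_encoder('vocab_dir')  # placeholder directory
    bpe_ids, raw_tokens = enc.encode('name : Blue Spice | food : Chinese')
    assert enc.decode(bpe_ids) == 'name : Blue Spice | food : Chinese'
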
examples/NLG/src/exp_utils.py ADDED
@@ -0,0 +1,46 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import functools
6
+ import os, shutil
7
+ import numpy as np
8
+
9
+ import torch
10
+
11
+
12
+ def logging(s, log_path, print_=True, log_=True):
13
+ if print_:
14
+ print(s)
15
+ if log_:
16
+ with open(log_path, 'a+') as f_log:
17
+ f_log.write(s + '\n')
18
+
19
+
20
+ def get_logger(log_path, **kwargs):
21
+ return functools.partial(logging, log_path=log_path, **kwargs)
22
+
23
+
24
+ def create_exp_dir(dir_path, scripts_to_save=None, debug=False):
25
+ if debug:
26
+ print('Debug Mode : no experiment dir created')
27
+ return functools.partial(logging, log_path=None, log_=False)
28
+
29
+ if not os.path.exists(dir_path):
30
+ os.makedirs(dir_path)
31
+
32
+ print('Experiment dir : {}'.format(dir_path))
33
+ if scripts_to_save is not None:
34
+ script_path = os.path.join(dir_path, 'scripts')
35
+ if not os.path.exists(script_path):
36
+ os.makedirs(script_path)
37
+ for script in scripts_to_save:
38
+ dst_file = os.path.join(dir_path, 'scripts', os.path.basename(script))
39
+ shutil.copyfile(script, dst_file)
40
+
41
+ return get_logger(log_path=os.path.join(dir_path, 'log.txt'))
42
+
43
+
44
+ def save_checkpoint(model, optimizer, path, epoch):
45
+ torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch)))
46
+ torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch)))
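
The value returned by create_exp_dir above is a functools.partial around logging(), so it can be called like a function; a small sketch with placeholder paths:

    from exp_utils import create_exp_dir

    log = create_exp_dir('run_dir', scripts_to_save=['src/gpt2_ft.py'])  # placeholder paths
    log('starting fine-tuning')  # prints the message and appends it to run_dir/log.txt
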
examples/NLG/src/format_converting_dart.py ADDED
@@ -0,0 +1,43 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import sys
6
+ import io
7
+ import json
8
+
9
+
10
+ with open(sys.argv[1], 'r', encoding='utf8') as reader, \
11
+ open(sys.argv[2], 'w', encoding='utf8') as writer :
12
+ lines_dict = json.load(reader)
13
+
14
+ full_rela_lst = []
15
+ full_src_lst = []
16
+ full_tgt_lst = []
17
+ unique_src = 0
18
+
19
+ for example in lines_dict:
20
+ rela_lst = []
21
+ temp_triples = ''
22
+ for i, tripleset in enumerate(example['tripleset']):
23
+ subj, rela, obj = tripleset
24
+ rela = rela.lower()
25
+ rela_lst.append(rela)
26
+ if i > 0:
27
+ temp_triples += ' | '
28
+ temp_triples += '{} : {} : {}'.format(subj, rela, obj)
29
+
30
+ unique_src += 1
31
+
32
+ for sent in example['annotations']:
33
+ full_tgt_lst.append(sent['text'])
34
+ full_src_lst.append(temp_triples)
35
+ full_rela_lst.append(rela_lst)
36
+
37
+ print('unique source is', unique_src)
38
+
39
+ for src, tgt in zip(full_src_lst, full_tgt_lst):
40
+ x = {}
41
+ x['context'] = src # context #+ '||'
42
+ x['completion'] = tgt #completion
43
+ writer.write(json.dumps(x)+'\n')
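
For illustration (values invented, not taken from the DART data): each tripleset is flattened into a single ' | '-joined string of 'subject : relation : object' segments with the relation lowercased, and that string is written once per annotated sentence, so an output line looks roughly like:

    {"context": "Mars : moons : 2 | Mars : orbital period : 687 days", "completion": "Mars has two moons and an orbital period of 687 days."}
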
examples/NLG/src/format_converting_e2e.py ADDED
@@ -0,0 +1,20 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import sys
6
+ import io
7
+ import json
8
+
9
+
10
+ with open(sys.argv[1], 'r', encoding='utf8') as reader, \
11
+ open(sys.argv[2], 'w', encoding='utf8') as writer :
12
+ for line in reader:
13
+ items = line.strip().split('||')
14
+ context = items[0]
15
+ completion = items[1].strip('\n')
16
+ x = {}
17
+ x['context'] = context #+ '||'
18
+ x['completion'] = completion
19
+ writer.write(json.dumps(x)+'\n')
20
+
examples/NLG/src/format_converting_webnlg.py ADDED
@@ -0,0 +1,68 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import sys
6
+ import io
7
+ import json
8
+
9
+
10
+ with open(sys.argv[1], 'r', encoding='utf8') as reader, \
11
+ open(sys.argv[2], 'w', encoding='utf8') as writer :
12
+ lines_dict = json.load(reader)
13
+
14
+ full_rela_lst = []
15
+ full_src_lst = []
16
+ full_tgt_lst = []
17
+ full_cate_lst = []
18
+
19
+ seen = [
20
+ 'Airport',
21
+ 'Astronaut',
22
+ 'Building',
23
+ 'City',
24
+ 'ComicsCharacter',
25
+ 'Food',
26
+ 'Monument',
27
+ 'SportsTeam',
28
+ 'University',
29
+ 'WrittenWork'
30
+ ]
31
+
32
+ cate_dict = {}
33
+ for i, example in enumerate(lines_dict['entries']):
34
+ sents = example[str(i+1)]['lexicalisations']
35
+ triples = example[str(i + 1)]['modifiedtripleset']
36
+ cate = example[str(i + 1)]['category']
37
+
38
+ if not cate in cate_dict:
39
+ cate_dict[cate] = 0
40
+ cate_dict[cate] += 1
41
+
42
+ rela_lst = []
43
+ temp_triples = ''
44
+ for i, tripleset in enumerate(triples):
45
+ subj, rela, obj = tripleset['subject'], tripleset['property'], tripleset['object']
46
+ rela_lst.append(rela)
47
+ if i > 0:
48
+ temp_triples += ' | '
49
+ temp_triples += '{} : {} : {}'.format(subj, rela, obj)
50
+
51
+ for sent in sents:
52
+ if sent["comment"] == 'good':
53
+ full_tgt_lst.append(sent['lex'])
54
+ full_src_lst.append(temp_triples)
55
+ full_rela_lst.append(rela_lst)
56
+ full_cate_lst.append(cate)
57
+
58
+ for cate in cate_dict:
59
+ print('cate', cate, cate_dict[cate])
60
+
61
+ #edited_sents = []
62
+ for src, tgt, cate in zip(full_src_lst, full_tgt_lst, full_cate_lst):
63
+ x = {}
64
+ x['context'] = src # context #+ '||'
65
+ x['completion'] = tgt #completion
66
+ x['cate'] = cate in seen
67
+ writer.write(json.dumps(x)+'\n')
68
+
examples/NLG/src/gpt2_beam.py ADDED
@@ -0,0 +1,419 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+
6
+ # python -m torch.distributed.launch --nproc_per_node=1 src/gpt2_beam.py \
7
+ # --data ./data/e2e/test.jsonl \
8
+ # --batch_size 1 \
9
+ # --seq_len 512 \
10
+ # --eval_len 64 \
11
+ # --model_card gpt2.md \
12
+ # --platform local \
13
+ # --beam 10 \
14
+ # --length_penalty 0.8 \
15
+ # --no_repeat_ngram_size 4 \
16
+ # --repetition_penalty 1.0 \
17
+ # --eos_token_id 628 \
18
+ # --lora_dim 4 \
19
+ # --lora_alpha 32 \
20
+ # --work_dir ./trained_models/GPT2_M/e2e \
21
+ # --output_file predict.26290.jsonl \
22
+ # --init_checkpoint ./trained_models/GPT2_M/e2e/model.26290.pt
23
+
24
+
25
+ import argparse
26
+ import time
27
+ import math
28
+ import os, sys
29
+ import json
30
+ import itertools
31
+ from typing import Callable, Dict, Iterable, List, Optional, Tuple
32
+
33
+ import torch
34
+ from torch import Tensor, device, dtype, nn
35
+ from torch.nn import CrossEntropyLoss
36
+ from torch.nn import functional as F
37
+ from torch.utils.data import DataLoader
38
+ import torch.nn.functional as F
39
+ torch.set_printoptions(threshold=100000)
40
+
41
+ import numpy as np
42
+
43
+ from gpu import (
44
+ add_gpu_params,
45
+ parse_gpu,
46
+ distributed_opt,
47
+ distributed_gather,
48
+ distributed_sync,
49
+ cleanup
50
+ )
51
+
52
+ from exp_utils import create_exp_dir
53
+
54
+ from data_utils import FT_Dataset
55
+ from model import GPT2Config, GPT2LMModel
56
+
57
+
58
+ parser = argparse.ArgumentParser(description='PyTorch GPT2 beam decoding')
59
+
60
+ add_gpu_params(parser)
61
+
62
+ parser.add_argument('--data', type=str, default='../data/wikitext-103',
63
+ help='location of the data corpus')
64
+
65
+ parser.add_argument('--batch_size', type=int, default=10,
66
+ help='batch size')
67
+
68
+ parser.add_argument('--seq_len', type=int, default=512,
69
+ help='number of tokens to predict')
70
+
71
+ parser.add_argument('--eval_len', type=int, default=256,
72
+ help='evaluation length')
73
+
74
+ parser.add_argument('--min_length', type=int, default=0,
75
+ help='minimum generation length')
76
+
77
+ parser.add_argument('--model_card', default='gpt2.sm', choices=['gpt2.sm', 'gpt2.md', 'gpt2.lg'],
78
+ help='model names')
79
+
80
+ parser.add_argument('--init_checkpoint', default=None, type=str, help='initial checkpoint')
81
+
82
+ parser.add_argument('--lora_dim', type=int, default=0, help='lora attn dimension')
83
+
84
+ parser.add_argument('--lora_alpha', type=int, default=128, help='lora attn alpha')
85
+
86
+ parser.add_argument('--work_dir', type=str, default=os.getenv('PT_OUTPUT_DIR', 'gpt2_model'),
87
+ help='working folder')
88
+
89
+ parser.add_argument('--beam', type=int, default=1, help='beam search size')
90
+
91
+ parser.add_argument('--length_penalty', type=float, default=1.0, help='length penalty')
92
+
93
+ parser.add_argument('--no_repeat_ngram_size', type=int, default=4, help='no_repeat_ngram_size')
94
+
95
+ parser.add_argument('--repetition_penalty', type=float, default=1.0, help='repetition_penalty')
96
+
97
+ parser.add_argument('--eos_token_id', action='append', type=int, default=[50256],
98
+ help='eos token id')
99
+
100
+ parser.add_argument('--output_file', type=str, default='beam_prediction.jsonl',
101
+ help='output file name')
102
+
103
+
104
+ def print_args(args):
105
+ if args.rank == 0:
106
+ print('=' * 100)
107
+ for k, v in args.__dict__.items():
108
+ print(' - {} : {}'.format(k, v))
109
+ print('=' * 100)
110
+
111
+
112
+ def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]:
113
+ return tuple(layer_past.index_select(1, beam_idx).contiguous().detach() for layer_past in past)
114
+
115
+
116
+ def _calc_banned_ngram_tokens(
117
+ prev_input_ids: Tensor,
118
+ num_hypos: int,
119
+ no_repeat_ngram_size: int,
120
+ cur_len: int
121
+ ) -> None:
122
+ """Copied from fairseq for no_repeat_ngram in beam_search"""
123
+ if cur_len + 1 < no_repeat_ngram_size:
124
+ # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
125
+ return [[] for _ in range(num_hypos)]
126
+
127
+ generated_ngrams = [{} for _ in range(num_hypos)]
128
+ for idx in range(num_hypos):
129
+ gen_tokens = prev_input_ids[idx].tolist()
130
+ generated_ngram = generated_ngrams[idx]
131
+ for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
132
+ prev_ngram_tuple = tuple(ngram[:-1])
133
+ generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
134
+
135
+ def _get_generated_ngrams(hypo_idx):
136
+ # Before decoding the next token, prevent decoding of ngrams that have already appeared
137
+ start_idx = cur_len + 1 - no_repeat_ngram_size
138
+ ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist())
139
+ return generated_ngrams[hypo_idx].get(ngram_idx, [])
140
+
141
+ banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
142
+ return banned_tokens
143
+
144
+
145
+ def _enforce_repetition_penalty_(
146
+ lprobs,
147
+ batch_size,
148
+ num_beams,
149
+ prev_output_tokens,
150
+ repetition_penalty
151
+ ):
152
+ """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """
153
+
154
+ for i in range(batch_size * num_beams):
155
+ print('prev_output_tokens.shape', prev_output_tokens.shape)
156
+ print('prev_output_tokens[i].shape', prev_output_tokens[i].shape)
157
+
158
+ for previous_token in set(prev_output_tokens[i].tolist()):
159
+ # if score < 0 then the repetition penalty has to be multiplied to reduce the previous token probability
160
+ if lprobs[i, previous_token] < 0:
161
+ lprobs[i, previous_token] *= repetition_penalty
162
+ else:
163
+ lprobs[i, previous_token] /= repetition_penalty
164
+
165
+ def _postprocess_next_token_scores(
166
+ scores,
167
+ history,
168
+ cur_len,
169
+ batch_size,
170
+ num_beams,
171
+ repetition_penalty=1.0,
172
+ no_repeat_ngram_size=4,
173
+ bad_words_ids=None,
174
+ min_length=0,
175
+ max_length=100,
176
+ eos_token_id=None,
177
+ ):
178
+ # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
179
+ if repetition_penalty != 1.0 and history is not None:
180
+ _enforce_repetition_penalty_(scores, batch_size, num_beams, history, repetition_penalty)
181
+
182
+ # score: batch_size * beam, vocab
183
+ # set eos token prob to zero if min_length is not reached
184
+ if eos_token_id is not None and cur_len < min_length:
185
+ for eos in eos_token_id:
186
+ scores[:, eos] = -float("inf")
187
+
188
+ if no_repeat_ngram_size > 0 and history is not None:
189
+ # calculate a list of banned tokens to prevent repetitively generating the same ngrams
190
+ num_batch_hypotheses = batch_size * num_beams
191
+ # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345
192
+ banned_batch_tokens = _calc_banned_ngram_tokens(
193
+ history, num_batch_hypotheses, no_repeat_ngram_size, cur_len
194
+ )
195
+
196
+ for i, banned_tokens in enumerate(banned_batch_tokens):
197
+ scores[i, banned_tokens] = -float("inf")
198
+
199
+ return scores
200
+
201
+
202
+ def _add_beam_candidate(
203
+ best_score,
204
+ best_sequence,
205
+ batch_size,
206
+ num_beams,
207
+ beam_scores,
208
+ history,
209
+ eos_token_id=None
210
+ ):
211
+ last_tokens = history[:, -1]
212
+ for _i in range(batch_size * num_beams):
213
+ if eos_token_id is None or last_tokens[_i] in eos_token_id:
214
+ cur_len = history.shape[-1]
215
+ _score = beam_scores.view(-1)[_i] / cur_len ** args.length_penalty
216
+
217
+ batch_id = _i // num_beams
218
+
219
+ if not batch_id in best_score or best_score[batch_id] < _score:
220
+ best_score[batch_id] = _score
221
+ best_sequence[batch_id][:cur_len] = history[_i]
222
+
223
+ beam_scores.view(-1)[_i] = -float("inf")
224
+
225
+
226
+ def beam(model, data_iter, args):
227
+ model.eval()
228
+ total_loss = 0.
229
+ start_time = time.time()
230
+
231
+ all_predictions = {}
232
+ with torch.no_grad():
233
+ for idx, data in enumerate(data_iter):
234
+ data = {key: value for key, value in data.items()}
235
+
236
+ _id = data['id'].to(args.device)
237
+ _query = data['query'].to(args.device)
238
+ _query_len = data['query_len'].to(args.device)
239
+
240
+ ## local adaptation start.
241
+
242
+ ## local adaptation end.
243
+
244
+
245
+ output = None
246
+ score = None
247
+
248
+ batch_size = _id.size(0)
249
+ num_beams = args.beam
250
+ length_penalty = args.length_penalty
251
+
252
+ _batch = torch.arange(0, _id.size(0), device=args.device, dtype=torch.long)
253
+
254
+ past = None
255
+ len_past = None
256
+
257
+ _query = _query.repeat(1, num_beams).view(batch_size * num_beams, -1)
258
+ _query_len = _query_len.unsqueeze(-1).repeat(1, num_beams).view(-1)
259
+
260
+ _bbatch = _batch.unsqueeze(-1).repeat(1, num_beams).view(-1)
261
+
262
+ # scores for each sentence in the beam
263
+ beam_scores = torch.zeros(
264
+ (batch_size, num_beams), dtype=torch.float, device=_query.device
265
+ )
266
+
267
+ best_sequence = torch.zeros(
268
+ (batch_size, args.eval_len), dtype=torch.long, device=_query.device
269
+ )
270
+ best_score = {}
271
+
272
+ history = None
273
+ with torch.no_grad():
274
+ for i in range(0, args.eval_len):
275
+ if i == 0:
276
+ logits, past = model(_query)
277
+ logits = logits[_bbatch, (_query_len-1).long(), :] # batch_size * beam, vocab
278
+ else:
279
+ #print('token_id.shape', token_id.shape, token_id)
280
+ #print('past.shape', past[0].shape)
281
+ #print('len_past.shape', len_past.shape, len_past)
282
+
283
+ logits, past = model(token_id, past=past, len_past=len_past)
284
+ logits = logits[:, -1, :] # batch_size * beam, vocab
285
+
286
+ logits = _postprocess_next_token_scores(
287
+ logits,
288
+ history,
289
+ i,
290
+ batch_size,
291
+ num_beams,
292
+ repetition_penalty=args.repetition_penalty,
293
+ no_repeat_ngram_size=args.no_repeat_ngram_size,
294
+ min_length=args.min_length,
295
+ eos_token_id=args.eos_token_id,
296
+ )
297
+
298
+ softmax_probs = F.softmax(logits, dim=-1)
299
+ ##_prob, _w_idx = torch.topk(softmax_probs, num_beams) # batch_size, beam
300
+
301
+ vocab_size = softmax_probs.shape[-1]
302
+
303
+
304
+ _logprob = torch.log(softmax_probs) # batch_size * beam, vocab
305
+ if i == 0:
306
+ next_scores = _logprob.view(batch_size, num_beams, -1)[:, 0, :] # batch_size, vocab
307
+
308
+ else:
309
+ next_scores = beam_scores.unsqueeze(-1) + _logprob.view(batch_size, num_beams, -1)
310
+ next_scores = next_scores.view(batch_size, -1) # batch_size, beam * vocab
311
+
312
+ next_scores, next_tokens = torch.topk(
313
+ next_scores, num_beams, dim=1, largest=True, sorted=True
314
+ ) # batch_size, num_beams
315
+
316
+ beam_id = (next_tokens // vocab_size).view(-1) # batch_size * num_beams
317
+ token_id = (next_tokens % vocab_size).view(-1).unsqueeze(-1) # batch_size, num_beams
318
+
319
+ beam_idx = beam_id.view(batch_size, num_beams) + (_batch * num_beams).unsqueeze(-1)
320
+ past = _reorder_cache(past, beam_idx.view(-1))
321
+ beam_scores = next_scores # batch_size, num_beams
322
+ len_past = (_query_len + i).long()
323
+
324
+ if history is None:
325
+ history = token_id.detach()
326
+ else:
327
+ history = torch.cat((history[beam_idx.view(-1)], token_id.detach()), dim=1).detach()
328
+
329
+ _add_beam_candidate(
330
+ best_score, best_sequence, batch_size, num_beams, beam_scores, history,
331
+ eos_token_id=args.eos_token_id
332
+ )
333
+
334
+ _add_beam_candidate(
335
+ best_score, best_sequence, batch_size, num_beams, beam_scores, history
336
+ )
337
+
338
+
339
+ with torch.no_grad():
340
+ _id = distributed_gather(args, _id)
341
+ output = distributed_gather(args, best_sequence)
342
+ #score = distributed_gather(args, score)
343
+ distributed_sync(args)
344
+
345
+ if args.rank == 0:
346
+ _id = _id.view(-1).cpu()
347
+ output = output.view(-1, output.shape[-1]).cpu()
348
+ #score = score.view(-1, score.shape[-1]).cpu()
349
+
350
+ for _b in range(0, _id.shape[-1]):
351
+ _i = int(_id[_b].item())
352
+ all_predictions[_i] = {}
353
+ all_predictions[_i]['id'] = _i
354
+ all_predictions[_i]['predict'] = output[_b].tolist()
355
+ #all_predictions[_i]['score'] = score[_b].tolist()
356
+
357
+ if idx % 10 == 0:
358
+ print('inference samples', idx)
359
+ # pred_file = os.path.join(args.work_dir, args.output_file)
360
+ # print('saving prediction file', pred_file)
361
+ # with open(pred_file, 'w') as writer:
362
+ # for _i in all_predictions:
363
+ # writer.write(json.dumps(all_predictions[_i]) + '\n')
364
+
365
+ if args.rank == 0:
366
+ pred_file = os.path.join(args.work_dir, args.output_file)
367
+ print('saving prediction file', pred_file)
368
+ with open(pred_file, 'w') as writer:
369
+ for _i in all_predictions:
370
+ writer.write(json.dumps(all_predictions[_i]) + '\n')
371
+
372
+
373
+ if __name__ == '__main__':
374
+ args = parser.parse_args()
375
+ parse_gpu(args)
376
+ print_args(args)
377
+
378
+ if args.rank == 0:
379
+ args.logging = create_exp_dir(args.work_dir)
380
+
381
+ valid_data = FT_Dataset(
382
+ args.data, args.batch_size, args.seq_len, args.eval_len,
383
+ )
384
+ valid_data = valid_data.get_item_list(0, 1000)
385
+ valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_data)
386
+ valid_loader = DataLoader(
387
+ valid_data, batch_size=args.batch_size, num_workers=0, shuffle=False,
388
+ pin_memory=False, drop_last=False, sampler=valid_sampler
389
+ )
390
+
391
+ if args.model_card == 'gpt2.sm':
392
+ config = GPT2Config(
393
+ n_embd=768, n_layer=12, n_head=12,
394
+ lora_attn_dim=args.lora_dim, lora_attn_alpha=args.lora_alpha,
395
+ )
396
+ elif args.model_card == 'gpt2.md':
397
+ config = GPT2Config(
398
+ n_embd=1024, n_layer=24, n_head=16,
399
+ lora_attn_dim=args.lora_dim, lora_attn_alpha=args.lora_alpha,
400
+ )
401
+ elif args.model_card == 'gpt2.lg':
402
+ config = GPT2Config(
403
+ n_embd=1280, n_layer=36, n_head=20,
404
+ lora_attn_dim=args.lora_dim, lora_attn_alpha=args.lora_alpha,
405
+ )
406
+
407
+ lm_net = GPT2LMModel(config)
408
+ if args.init_checkpoint is not None:
409
+ print('loading model pretrained weight.')
410
+ cp = torch.load(args.init_checkpoint, map_location=torch.device('cpu'))
411
+ lm_net.load_weight(cp)
412
+ lm_net = lm_net.cuda()
413
+ print(lm_net.transformer.h[0].mlp)
414
+
415
+ print('model sampling ...')
416
+ beam(lm_net, valid_loader, args)
417
+ distributed_sync(args)
418
+ print('cleanup dist ...')
419
+ cleanup(args)
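
The prediction file written by the rank-0 process above is a JSONL of raw token ids, one object per evaluated example (field names as in the code; the ids themselves are placeholders here); gpt2_decode.py later maps 'predict' back to text with the BPE encoder:

    {"id": 0, "predict": [<generated BPE ids, zero-padded up to eval_len>]}
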
examples/NLG/src/gpt2_decode.py ADDED
@@ -0,0 +1,187 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+
6
+ # python -m torch.distributed.launch --nproc_per_node=1 src/gpt2_beam.py \
7
+ # --data ./data/e2e/test.jsonl \
8
+ # --batch_size 1 \
9
+ # --seq_len 512 \
10
+ # --eval_len 64 \
11
+ # --model_card gpt2.md \
12
+ # --platform local \
13
+ # --beam 10 \
14
+ # --length_penalty 0.8 \
15
+ # --no_repeat_ngram_size 4 \
16
+ # --repetition_penalty 1.0 \
17
+ # --eos_token_id 628 \
18
+ # --lora_dim 4 \
19
+ # --lora_alpha 32 \
20
+ # --work_dir ./trained_models/GPT2_M/e2e \
21
+ # --output_file predict.26290.jsonl \
22
+ # --init_checkpoint ./trained_models/GPT2_M/e2e/model.26290.pt
23
+
24
+
25
+ import json
26
+ import numpy as np
27
+ import argparse
28
+ import os
29
+ import sys
30
+ import re
31
+ import json
32
+
33
+ import torch
34
+ import torch.nn as nn
35
+ import torch.nn.parallel
36
+ import torch.backends.cudnn as cudnn
37
+ import torch.optim as optim
38
+ import torch.utils.data
39
+
40
+ import encoder
41
+
42
+
43
+ parser = argparse.ArgumentParser()
44
+
45
+ parser.add_argument('--vocab', type=str, default=None, help='vocab path')
46
+
47
+ parser.add_argument('--sample_file', default=None, type=str, help='ft sample file')
48
+ parser.add_argument('--input_file', default=None, type=str, help='ft input file')
49
+
50
+ parser.add_argument('--output_ref_file', default=None, type=str, help='output reference file')
51
+ parser.add_argument('--output_pred_file', default=None, type=str, help='output prediction file')
52
+
53
+ parser.add_argument('--ref_unique_file', default=None, type=str, help='reference unique id file')
54
+
55
+ parser.add_argument('--ref_type', default='e2e', choices=['e2e', 'webnlg', 'dart'],
56
+ help='e2e style reference type; webnlg style reference type.')
57
+ parser.add_argument('--ref_num', default=4, type=int, help='number of references.')
58
+
59
+
60
+ parser.add_argument('--tokenize', action='store_true', help='')
61
+ parser.add_argument('--lower', action='store_true', help='')
62
+
63
+ parser.add_argument('--filter', default='all', choices=['all', 'seen', 'unseen'],
64
+ help='for webnlg only, filter categories that are seen during training, unseen, or all')
65
+
66
+ args = parser.parse_args()
67
+
68
+
69
+ def stardard_tokenize(sent):
70
+ sent = ' '.join(re.split(r'(\W)', sent))
71
+ sent = sent.split()
72
+ sent = ' '.join(sent)
73
+ return sent
74
+
75
+
76
+ def post_process(sent, is_tokenize, is_lower):
77
+ if is_lower:
78
+ sent = sent.lower()
79
+ if is_tokenize:
80
+ sent = stardard_tokenize(sent)
81
+
82
+ return sent
83
+
84
+
85
+ if __name__ == "__main__":
86
+ enc = encoder.get_encoder(args.vocab)
87
+
88
+ ref_unique = None
89
+
90
+ if args.ref_unique_file is not None:
91
+ print('reading ref_unique_file.')
92
+ ref_unique = []
93
+ uniques = {}
94
+ with open(args.ref_unique_file, 'r') as ref_unique_reader:
95
+ for line in ref_unique_reader:
96
+ _id = int(line.strip())
97
+ ref_unique.append(_id)
98
+ uniques[_id] = 1
99
+ print('len refer dict', len(ref_unique), 'unique', len(uniques))
100
+
101
+ with open(args.sample_file, 'r') as sample_reader, \
102
+ open(args.input_file, 'r', encoding='utf8') as input_reader, \
103
+ open(args.output_pred_file, 'w', encoding='utf8') as pred_writer:
104
+
105
+ refer_dict = {}
106
+ context_list = []
107
+ line_id = 0
108
+ for line in input_reader:
109
+ items = json.loads(line.strip())
110
+ context = items['context']
111
+ completion = items['completion']
112
+
113
+ context_list.append(context)
114
+
115
+ keep = False
116
+
117
+ if args.filter == 'all':
118
+ keep = True
119
+ if args.filter == 'seen' and items['cate']:
120
+ keep = True
121
+ if args.filter == 'unseen' and not items['cate']:
122
+ keep = True
123
+
124
+ if ref_unique is None:
125
+ _key = context
126
+ else:
127
+ _key = ref_unique[line_id]
128
+
129
+ if keep:
130
+ if not _key in refer_dict:
131
+ refer_dict[_key] = {}
132
+ refer_dict[_key]['references'] = []
133
+ refer_dict[_key]['references'].append(completion.split('<|endoftext|>')[0].split('\n\n')[0].strip())
134
+
135
+ line_id += 1
136
+ if line_id==1000:
137
+ break
138
+
139
+ print('unique refer dict', len(refer_dict))
140
+
141
+ for line in sample_reader:
142
+ items = json.loads(line.strip())
143
+ _id = items['id']
144
+ _pred_tokens = items['predict']
145
+
146
+ if ref_unique is None:
147
+ _key = context_list[_id]
148
+ else:
149
+ _key = ref_unique[_id]
150
+
151
+ #assert _key in refer_dict
152
+ # if _key in refer_dict:
153
+ if not _key in refer_dict:
154
+ refer_dict[_key] = {}
155
+ refer_dict[_key]['sample'] = []
156
+ refer_dict[_key]['sample'] = enc.decode(_pred_tokens).split('<|endoftext|>')[0].split('\n\n')[0].strip()
157
+
158
+ references = [refer_dict[s]['references'] for s in refer_dict]
159
+ hypothesis = [refer_dict[s]['sample'] for s in refer_dict]
160
+
161
+ if args.ref_type == 'e2e':
162
+ with open(args.output_ref_file, 'w', encoding='utf8') as ref_writer:
163
+ for ref, hyp in zip(references, hypothesis):
164
+ for r in ref:
165
+ ref_writer.write(post_process(r, args.tokenize, args.lower) + '\n')
166
+ ref_writer.write('\n')
167
+ pred_writer.write(post_process(hyp, args.tokenize, args.lower) + '\n')
168
+
169
+ elif args.ref_type in ['webnlg', 'dart']:
170
+ if not os.path.exists(args.output_ref_file):
171
+ os.makedirs(args.output_ref_file)
172
+
173
+ reference_writers = [
174
+ open(os.path.join(args.output_ref_file, f'reference{fid}'), 'w', encoding='utf8')
175
+ for fid in range(0, args.ref_num)
176
+ ]
177
+
178
+ for ref, hyp in zip(references, hypothesis):
179
+ for fid in range(0, args.ref_num):
180
+ if len(ref) > fid:
181
+ reference_writers[fid].write(post_process(ref[fid], args.tokenize, args.lower) + '\n')
182
+ else:
183
+ reference_writers[fid].write(post_process(ref[0], args.tokenize, args.lower) + '\n')
184
+ pred_writer.write(post_process(hyp, args.tokenize, args.lower) + '\n')
185
+
186
+ for writer in reference_writers:
187
+ writer.close()
examples/NLG/src/gpt2_encode.py ADDED
@@ -0,0 +1,70 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import json
6
+ import numpy as np
7
+
8
+ import encoder
9
+
10
+ import argparse
11
+ import os
12
+ import random
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.parallel
16
+ import torch.backends.cudnn as cudnn
17
+ import torch.optim as optim
18
+ import torch.utils.data
19
+
20
+ import numpy
21
+ import io
22
+ import sys
23
+ import threading
24
+ import math
25
+ import random
26
+
27
+ import json
28
+ import collections
29
+ from collections import Counter
30
+ from collections import OrderedDict
31
+ from progress.bar import Bar as Bar
32
+
33
+
34
+ parser = argparse.ArgumentParser()
35
+ parser.add_argument('--input', default=None, type=str, help='ft input file')
36
+ parser.add_argument('--vocab', type=str, default=None, help='vocab path')
37
+ parser.add_argument('--output', default=None, type=str, help='ft output file')
38
+ parser.add_argument('--add_bos', action='store_true', help='')
39
+ parser.add_argument('--add_eos', action='store_true', help='')
40
+ args = parser.parse_args()
41
+
42
+
43
+ if __name__ == "__main__":
44
+ enc = encoder.get_encoder(args.vocab)
45
+
46
+ writer = open(args.output, 'w')
47
+
48
+ with open(args.input, 'r') as reader:
49
+ line_idx = 0
50
+ for line in reader:
51
+ items = json.loads(line.strip())
52
+ context = items['context']
53
+ completion = items['completion']
54
+
55
+ bos = 50256
56
+ eos = 50256
57
+ context_bpes, _ = enc.encode(context)
58
+ context_bpes += [bos] if args.add_bos else []
59
+
60
+ completion_bpes, _ = enc.encode(' ' + completion)
61
+ completion_bpes += [eos] if args.add_eos else []
62
+
63
+ ft_json = {}
64
+ ft_json['context'] = context_bpes
65
+ ft_json['completion'] = completion_bpes
66
+ writer.write(json.dumps(ft_json)+'\n')
67
+
68
+ line_idx += 1
69
+
70
+ writer.close()
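
A sketch of the data contract for this script (example values only): the input JSONL comes from the format_converting_* scripts and carries plain strings, while the output replaces them with BPE token-id lists that FT_Dataset consumes directly:

    input line : {"context": "name : Blue Spice | food : Chinese", "completion": "Blue Spice serves Chinese food."}
    output line: {"context": [<context BPE ids>], "completion": [<completion BPE ids>, 50256]}  # 50256 appended only with --add_eos
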
examples/NLG/src/gpt2_ft.py ADDED
@@ -0,0 +1,385 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import argparse
6
+ import time
7
+ import math
8
+ import os, sys
9
+ import numpy as np
10
+ import itertools
11
+
12
+ import torch
13
+ import random
14
+ from torch.utils.data import DataLoader
15
+ torch.set_printoptions(threshold=100000)
16
+
17
+ from gpu import (
18
+ add_gpu_params,
19
+ parse_gpu,
20
+ distributed_opt,
21
+ distributed_gather,
22
+ distributed_sync,
23
+ cleanup
24
+ )
25
+ from optimizer import (
26
+ create_adam_optimizer,
27
+ create_optimizer_scheduler,
28
+ add_optimizer_params,
29
+ create_adam_optimizer_from_args
30
+ )
31
+
32
+ from data_utils import FT_Dataset
33
+ from model import GPT2Config, GPT2LMModel
34
+ from exp_utils import create_exp_dir
35
+
36
+ import loralib as lora
37
+
38
+ parser = argparse.ArgumentParser(description='PyTorch GPT2 ft script')
39
+
40
+ add_gpu_params(parser)
41
+ add_optimizer_params(parser)
42
+
43
+ parser.add_argument('--train_data', required=True, help='location of training data corpus')
44
+
45
+ parser.add_argument('--valid_data', required=True, help='location of validation data corpus')
46
+
47
+ parser.add_argument('--train_batch_size', type=int, default=8, help='training batch size')
48
+
49
+ parser.add_argument('--valid_batch_size', type=int, default=4, help='validation batch size')
50
+
51
+ parser.add_argument('--grad_acc', type=int, default=1, help='gradient accumulation steps')
52
+
53
+ parser.add_argument('--clip', type=float, default=0.0, help='gradient clip')
54
+
55
+ parser.add_argument('--seq_len', type=int, default=512, help='number of tokens to predict.')
56
+
57
+ parser.add_argument('--model_card', default='gpt2.md', choices=['gpt2.sm', 'gpt2.md', 'gpt2.lg'],
58
+ help='model names')
59
+
60
+ parser.add_argument('--init_checkpoint', default=None, help='pretrained checkpoint path')
61
+
62
+ parser.add_argument('--fp16', action='store_true', help='train model with fp16')
63
+
64
+ parser.add_argument('--log_interval', type=int, default=100, help='log interval')
65
+
66
+ parser.add_argument('--eval_interval', type=int, default=2000, help='eval interval')
67
+
68
+ parser.add_argument('--save_interval', type=int, default=500, help='save interval')
69
+
70
+ parser.add_argument('--work_dir', type=str, default=os.getenv('PT_OUTPUT_DIR', 'gpt2_model'),
71
+ help='working folder.')
72
+
73
+ parser.add_argument('--lora_dim', type=int, default=0, help='lora attn dimension')
74
+
75
+ parser.add_argument('--lora_alpha', type=int, default=128, help='lora attn alpha')
76
+
77
+ parser.add_argument('--obj', default='clm', choices=['jlm', 'clm'],
78
+ help='language model training objective')
79
+
80
+ parser.add_argument('--lora_dropout', default=0.0, type=float,
81
+ help='dropout probability for lora layers')
82
+
83
+ parser.add_argument('--label_smooth', default=0.0, type=float, help='label smoothing')
84
+
85
+ parser.add_argument('--roll_interval', type=int, default=-1, help='rolling interval')
86
+
87
+ parser.add_argument('--roll_lr', type=float, default=0.00001, help='rolling learning rate')
88
+
89
+ parser.add_argument('--roll_step', type=int, default=100, help='rolling step')
90
+
91
+ parser.add_argument('--eval_epoch', type=int, default=1, help='eval per number of epochs')
92
+
93
+ # print all parsed arguments on the rank-0 process.
94
+ def print_args(args):
95
+ if args.rank == 0:
96
+ print('=' * 100)
97
+ for k, v in args.__dict__.items():
98
+ print(f' - {k} : {v}')
99
+ print('=' * 100)
100
+
101
+
102
+ class AverageMeter(object):
103
+ """Computes and stores the average and current value
104
+ Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
105
+ """
106
+ def __init__(self):
107
+ self.reset()
108
+
109
+ def reset(self):
110
+ self.val = 0
111
+ self.avg = 0
112
+ self.sum = 0
113
+ self.count = 0
114
+
115
+ def update(self, val, n=1):
116
+ self.val = val
117
+ self.sum += val * n
118
+ self.count += n
119
+ self.avg = self.sum / self.count
120
+
121
+
122
+ def optimizer_step(_loss, _optimizer, _model, _schedule, args, is_update=True):
123
+ if args.fp16:
124
+ with amp.scale_loss(_loss, _optimizer) as _scaled_loss:
125
+ _scaled_loss.backward()
126
+ else:
127
+ _loss.backward()
128
+
129
+ # for name, param in _model.named_parameters():
130
+ # if param.requires_grad and param.grad is not None:
131
+ # print(f"Parameter name: {name}")
132
+ # print(f"Gradient value: {param.grad}")
133
+
134
+ if is_update:
135
+ if args.clip > 0:
136
+ if args.fp16:
137
+ torch.nn.utils.clip_grad_norm_(amp.master_params(_optimizer), args.clip)
138
+ else:
139
+ torch.nn.utils.clip_grad_norm_(_model.parameters(), args.clip)
140
+
141
+ _optimizer.step()
142
+ _optimizer.zero_grad()
143
+
144
+ if _schedule is not None:
145
+ _schedule.step()
146
+
147
+ # print(f"query[0].lora_B = {_model.module.transformer.h[0].attn.c_attn.lora_B}")
148
+
149
+
150
+ def evaluate(model, valid_loader, args):
151
+ model.eval()
152
+ total_loss = 0.
153
+ start_time = time.time()
154
+
155
+ avg_lm_loss = AverageMeter()
156
+
157
+ with torch.no_grad():
158
+ for idx, data in enumerate(valid_loader):
159
+ data = {key: value for key, value in data.items()}
160
+
161
+ _input = data['input'].to(args.device)
162
+ _target = data['target'].to(args.device)
163
+ _msk = data['mask'].to(args.device)
164
+
165
+ _lm_logits, _loss = model(_input, lm_labels=_target, lm_mask=_msk)
166
+ loss = _loss.mean()
167
+ # print(f"logits={_lm_logits}, _loss={_loss}")
168
+
169
+ avg_lm_loss.update(loss.item())
170
+
171
+ if idx % 100 == 0:
172
+ print('eval samples:', idx, 'loss:', loss.float())
173
+
174
+ total_time = time.time() - start_time
175
+ print('average loss', avg_lm_loss.avg)
176
+ return avg_lm_loss.avg, math.exp(avg_lm_loss.avg)
177
+
178
+
179
+ def train_validate(
180
+ model,
181
+ optimizer,
182
+ scheduler,
183
+ train_loader,
184
+ valid_loader,
185
+ args,
186
+ train_step=0,
187
+ epoch=0
188
+ ):
189
+ model.train()
190
+ avg_lm_loss = AverageMeter()
191
+ print('start to train the model................', epoch)
192
+ log_start_time = time.time()
193
+ best_val_ppl = None
194
+
195
+ # train_loader.sampler.set_epoch(epoch)
196
+
197
+ for idx, data in enumerate(train_loader):
198
+ data = {key: value for key, value in data.items()}
199
+
200
+ _input = data['input'].to(args.device)
201
+ _target = data['target'].to(args.device)
202
+ _msk = data['mask'].to(args.device)
203
+
204
+ _lm_logits, _lm_loss = model(
205
+ _input, lm_labels=_target, lm_mask=_msk, label_smooth=args.label_smooth
206
+ )
207
+ # print(_input[0])
208
+
209
+ _lm_loss = _lm_loss.mean()
210
+
211
+ train_step += 1
212
+ is_update = True if train_step % args.grad_acc == 0 else False
213
+ avg_lm_loss.update(_lm_loss.item())
214
+ optimizer_step(
215
+ _lm_loss/(args.grad_acc), optimizer, model, scheduler, args, is_update=is_update
216
+ )
217
+
218
+ if train_step % args.log_interval == 0:
219
+ print(f"_lm_loss = {_lm_loss}")
220
+ print(f"layer[0].lora_A = {model.module.transformer.h[0].attn.c_attn.lora_A[0,:100]}")
221
+ elapsed = time.time() - log_start_time
222
+ lr = optimizer.param_groups[0]['lr']
223
+ log_str = f'| epoch {epoch:3d} step {train_step:>8d} | { idx + 1:>6d} batches | ' \
224
+ f'lr {lr:.3g} | ms/batch {elapsed * 1000 / args.log_interval:5.2f} | ' \
225
+ f'loss {avg_lm_loss.val:5.2f} | avg loss {avg_lm_loss.avg:5.2f} | ' \
226
+ f'ppl {math.exp(avg_lm_loss.avg):5.2f}'
227
+
228
+ if args.rank == 0:
229
+ print(log_str)
230
+ log_start_time = time.time()
231
+ avg_lm_loss.reset()
232
+
233
+ if train_step % args.save_interval == 0:
234
+ if args.rank == 0:
235
+ model_path = os.path.join(args.work_dir, f'model.{train_step}.pt')
236
+ print('saving checkpoint', model_path)
237
+ torch.save({'model_state_dict': lora.lora_state_dict(model)}, model_path)
238
+ distributed_sync(args)
239
+
240
+ # evaluation interval
241
+ if train_step % args.eval_interval == 0:
242
+ eval_start_time = time.time()
243
+
244
+ valid_loss, valid_ppl = evaluate(model, valid_loader, args)
245
+
246
+ if best_val_ppl is None or valid_ppl < best_val_ppl:
247
+ best_val_ppl = valid_ppl
248
+
249
+ log_str = f'| Eval {train_step // args.eval_interval:3d} at step {train_step:>8d} | ' \
250
+ f'time: {time.time() - eval_start_time:5.2f}s | valid loss {valid_loss:5.2f} | ' \
251
+ f'valid ppl {valid_ppl:5.2f} | best ppl {best_val_ppl:5.2f} '
252
+
253
+ if args.rank == 0:
254
+ print('-' * 100)
255
+ print(log_str)
256
+ print('-' * 100)
257
+
258
+ model.train()
259
+ distributed_sync(args)
260
+
261
+ if train_step == args.max_step:
262
+ break
263
+
264
+ if args.rank == 0:
265
+ model_path = os.path.join(args.work_dir, f'model.{train_step}.pt')
266
+ print('saving checkpoint', model_path)
267
+ torch.save({'model_state_dict': model.state_dict()}, model_path)  # end-of-epoch save keeps the full state dict; interval saves above keep only the LoRA weights
268
+ distributed_sync(args)
269
+ return train_step
270
+
271
+
272
+ if __name__ == '__main__':
273
+ args = parser.parse_args()
274
+ parse_gpu(args)
275
+ print_args(args)
276
+
277
+ if args.fp16:
278
+ try:
279
+ from apex import amp
280
+ except Exception as e:
281
+ warnings.warn('Could not import amp, apex may not be installed')
282
+
283
+ torch.manual_seed(args.random_seed)
284
+ random.seed(args.random_seed)
285
+
286
+ if args.rank == 0:
287
+ args.logging = create_exp_dir(args.work_dir)
288
+
289
+ train_data = FT_Dataset(
290
+ args.train_data, args.train_batch_size, args.seq_len,
291
+ joint_lm=args.obj=='jlm'
292
+ )
293
+
294
+ valid_data = FT_Dataset(
295
+ args.valid_data, args.valid_batch_size, args.seq_len,
296
+ )
297
+
298
+ train_loader = DataLoader(
299
+ train_data, batch_size=args.train_batch_size, num_workers=0,
300
+ shuffle=False, pin_memory=False, drop_last=True,
301
+ # sampler=torch.utils.data.distributed.DistributedSampler(train_data, seed=args.random_seed)
302
+ )
303
+
304
+ valid_loader = DataLoader(
305
+ valid_data, batch_size=args.valid_batch_size, num_workers=0,
306
+ shuffle=False, pin_memory=False, drop_last=False,
307
+ # sampler=torch.utils.data.distributed.DistributedSampler(valid_data, seed=args.random_seed)
308
+ )
309
+ print(f"train_loader={len(train_loader)}, train_data={len(train_data)}")
310
+ print(f"valid_loader={len(valid_loader)}, valid_data={len(valid_data)}")
311
+
312
+ if args.model_card == 'gpt2.sm':
313
+ config = GPT2Config(
314
+ n_embd=768, n_layer=12, n_head=12,
315
+ lora_attn_dim=args.lora_dim,
316
+ lora_attn_alpha=args.lora_alpha,
317
+ lora_dropout=args.lora_dropout,
318
+ )
319
+ elif args.model_card == 'gpt2.md':
320
+ config = GPT2Config(
321
+ n_embd=1024, n_layer=24, n_head=16,
322
+ lora_attn_dim=args.lora_dim,
323
+ lora_attn_alpha=args.lora_alpha,
324
+ lora_dropout=args.lora_dropout,
325
+ )
326
+ elif args.model_card == 'gpt2.lg':
327
+ config = GPT2Config(
328
+ n_embd=1280, n_layer=36, n_head=20,
329
+ lora_attn_dim=args.lora_dim,
330
+ lora_attn_alpha=args.lora_alpha,
331
+ lora_dropout=args.lora_dropout,
332
+ )
333
+
334
+ lm_net = GPT2LMModel(config)
335
+ if args.init_checkpoint is not None:
336
+ print('loading model pretrained weight.')
337
+ lm_net.load_weight(torch.load(args.init_checkpoint))
338
+
339
+ lm_net = lm_net.cuda()
340
+
341
+ if args.lora_dim > 0:
342
+ lora.mark_only_lora_as_trainable(lm_net)
343
+
344
+ print(lm_net)
345
+ print(lm_net.transformer.h[0].attn.c_attn.weight.shape)
346
+ print(lm_net.transformer.h[0].attn.c_attn.lora_A.shape)
347
+ print(lm_net.transformer.h[0].attn.c_attn.lora_B.shape)
348
+ config_dict = vars(config)
349
+ for param, value in config_dict.items():
350
+ print(f"{param}: {value}")
351
+ print(args)
352
+ optimizer = create_adam_optimizer_from_args(lm_net, args)
353
+ print("optimizer: " + str(optimizer))
354
+
355
+ if args.max_step is None:
356
+ args.max_step = (args.max_epoch * train_data.num_batches + args.world_size - 1) // args.world_size
357
+ print('set max_step:', args.max_step)
358
+ print('train_data.num_batches:', train_data.num_batches)
359
+
360
+ scheduler = create_optimizer_scheduler(optimizer, args)
361
+ if args.fp16:
362
+ lm_net, optimizer = amp.initialize(lm_net, optimizer, opt_level="O1")
363
+ lm_net, optimizer = distributed_opt(args, lm_net, optimizer, grad_acc=args.grad_acc)
364
+
365
+ try:
366
+ train_step = 0
367
+ for epoch in itertools.count(start=1):
368
+ train_step = train_validate(
369
+ lm_net, optimizer, scheduler, train_loader, valid_loader, args,
370
+ train_step=train_step, epoch=epoch
371
+ )
372
+
373
+ if train_step >= args.max_step or (args.max_epoch is not None and epoch >= args.max_epoch):
374
+ if args.rank == 0:
375
+ print('-' * 100)
376
+ print('End of training')
377
+ break
378
+ except KeyboardInterrupt:
379
+ if args.rank == 0:
380
+ print('-' * 100)
381
+ print('Exiting from training early')
382
+
383
+ distributed_sync(args)
384
+ print('cleanup dist ...')
385
+ cleanup(args)
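
Note: the interval checkpoints written by train_validate() contain only the LoRA weights (lora.lora_state_dict), so they have to be loaded on top of the pretrained GPT-2 weights with a non-strict state-dict load. A minimal sketch, assuming the gpt2.md configuration and the checkpoint paths shown in model.log below; illustrative only, not part of the repo:

import torch
import loralib as lora
from model import GPT2Config, GPT2LMModel

# Rebuild GPT-2 medium with the same LoRA settings used for training.
config = GPT2Config(
    n_embd=1024, n_layer=24, n_head=16,
    lora_attn_dim=4, lora_attn_alpha=32, lora_dropout=0.1,
)
model = GPT2LMModel(config)
model.load_weight(torch.load('./pretrained_checkpoints/gpt2-medium-pytorch_model.bin'))  # base weights
lora.mark_only_lora_as_trainable(model)  # freeze everything except lora_A / lora_B

# Interval checkpoints hold only LoRA tensors, hence strict=False.
ckpt = torch.load('./trained_models/GPT2_M/e2e/model.1000.pt')
model.load_state_dict(ckpt['model_state_dict'], strict=False)
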
examples/NLG/src/gpu.py ADDED
@@ -0,0 +1,129 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import argparse
6
+ import time
7
+ import math
8
+ import os, sys
9
+ import itertools
10
+
11
+ import numpy as np
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.optim as optim
16
+ import torch.distributed as dist
17
+
18
+
19
+ gpu_offset = 4  # offsets the CUDA device index used by every rank; set to 0 for the default GPU mapping
20
+
21
+ def add_gpu_params(parser: argparse.ArgumentParser):
22
+ parser.add_argument("--platform", default='k8s', type=str, help='platform cloud')
23
+ parser.add_argument("--local_rank", default=0, type=int, help='local rank')
24
+ parser.add_argument("--rank", default=0, type=int, help='rank')
25
+ parser.add_argument("--device", default=0, type=int, help='device')
26
+ parser.add_argument("--world_size", default=0, type=int, help='world size')
27
+ parser.add_argument("--random_seed", default=10, type=int, help='random seed')
28
+
29
+
30
+ def distributed_opt(args, model, opt, grad_acc=1):
31
+ if args.platform == 'azure':
32
+ args.hvd.broadcast_parameters(model.state_dict(), root_rank=0)
33
+ opt = args.hvd.DistributedOptimizer(
34
+ opt, named_parameters=model.named_parameters(), backward_passes_per_step=grad_acc
35
+ )
36
+ elif args.platform == 'philly' or args.platform == 'k8s' or args.platform == 'local':
37
+ model = torch.nn.parallel.DistributedDataParallel(
38
+ model, device_ids=[args.local_rank+gpu_offset], output_device=args.local_rank+gpu_offset, # change
39
+ find_unused_parameters=False, broadcast_buffers=False
40
+ )
41
+ return model, opt
42
+
43
+
44
+ def distributed_gather(args, tensor):
45
+ g_y = [torch.zeros_like(tensor) for _ in range(args.world_size)]
46
+ torch.distributed.all_gather(g_y, tensor, async_op=False)
47
+ return torch.stack(g_y)
48
+
49
+
50
+ def distributed_sync(args):
51
+ if args.platform == 'azure':
52
+ args.hvd.allreduce(torch.tensor(0), name='barrier')
53
+ else:
54
+ args.dist.barrier()
55
+
56
+
57
+ def parse_gpu(args):
58
+ torch.manual_seed(args.random_seed)
59
+
60
+ if args.platform == 'local':
61
+ dist.init_process_group(backend='nccl')
62
+ local_rank = torch.distributed.get_rank()
63
+ torch.cuda.set_device(local_rank+gpu_offset) # change
64
+ device = torch.device('cuda', local_rank+gpu_offset) # change
65
+ args.rank = local_rank
66
+ args.device = device
67
+ args.world_size = torch.distributed.get_world_size()
68
+ args.dist = dist
69
+
70
+ elif args.platform == 'azure':
71
+ import horovod.torch as hvd
72
+ hvd.init()
73
+ print('azure hvd rank', hvd.rank(), 'local rank', hvd.local_rank())
74
+ local_rank = hvd.local_rank()
75
+ torch.cuda.set_device(local_rank)
76
+ device = torch.device('cuda', local_rank)
77
+ rank = hvd.rank()
78
+ world_size = hvd.size()
79
+
80
+ args.local_rank = local_rank
81
+ args.rank = rank
82
+ args.device = device
83
+ args.world_size = world_size
84
+ args.hvd = hvd
85
+
86
+ elif args.platform == 'philly':
87
+ local_rank = args.local_rank
88
+ torch.cuda.set_device(local_rank)
89
+ dist.init_process_group(backend='nccl')
90
+ rank = dist.get_rank()
91
+ world_size = torch.distributed.get_world_size()
92
+ device = torch.device('cuda', local_rank)
93
+
94
+ args.rank = rank
95
+ args.device = device
96
+ args.world_size = world_size
97
+ args.dist = dist
98
+ elif args.platform == 'k8s':
99
+ master_uri = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
100
+ local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
101
+ args.local_rank = local_rank
102
+ world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
103
+ world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
104
+ rank = world_rank
105
+ torch.cuda.set_device(local_rank)
106
+
107
+ dist.init_process_group(
108
+ backend='nccl',
109
+ init_method=master_uri,
110
+ world_size=world_size,
111
+ rank=world_rank,
112
+ )
113
+ device = torch.device("cuda", local_rank)
114
+ args.rank = rank
115
+ args.device = device
116
+ args.world_size = world_size
117
+ args.dist = dist
118
+ print(
119
+ 'myrank:', args.rank,
120
+ 'local_rank:', args.local_rank,
121
+ 'device_count:', torch.cuda.device_count(),
122
+ 'world_size:', args.world_size,
123
+ 'device:', device
124
+ )
125
+
126
+
127
+ def cleanup(args):
128
+ if args.platform == 'k8s' or args.platform == 'philly':
129
+ args.dist.destroy_process_group()
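
Note: with --platform local, parse_gpu() initialises torch.distributed through the default env:// rendezvous, so the usual rendezvous variables must be set when the training script is started directly rather than through torchrun / torch.distributed.launch. A minimal single-process sketch (an assumption for illustration, not repo code):

import os
from argparse import ArgumentParser
from gpu import add_gpu_params, parse_gpu

# env:// rendezvous variables for a single-process run.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
os.environ.setdefault('RANK', '0')
os.environ.setdefault('WORLD_SIZE', '1')

parser = ArgumentParser()
add_gpu_params(parser)
args = parser.parse_args(['--platform', 'local'])
parse_gpu(args)  # selects cuda:{rank + gpu_offset}; with gpu_offset = 4 this needs at least 5 visible GPUs
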
examples/NLG/src/model.log ADDED
@@ -0,0 +1,698 @@
1
+ myrank: 0 local_rank: 0 device_count: 8 world_size: 1 device: cuda:4
2
+ ====================================================================================================
3
+ - platform : local
4
+ - local_rank : 0
5
+ - rank : 0
6
+ - device : cuda:4
7
+ - world_size : 1
8
+ - random_seed : 110
9
+ - lr : 0.0002
10
+ - weight_decay : 0.01
11
+ - correct_bias : True
12
+ - adam_epislon : 1e-06
13
+ - no_decay_bias : False
14
+ - adam_beta1 : 0.9
15
+ - adam_beta2 : 0.999
16
+ - scheduler : linear
17
+ - max_step : None
18
+ - max_epoch : 5
19
+ - warmup_step : 500
20
+ - i_steps : 0
21
+ - i_lrs : 0.00025
22
+ - train_data : ./data/e2e/train.jsonl
23
+ - valid_data : ./data/e2e/valid.jsonl
24
+ - train_batch_size : 8
25
+ - valid_batch_size : 4
26
+ - grad_acc : 1
27
+ - clip : 0.0
28
+ - seq_len : 512
29
+ - model_card : gpt2.md
30
+ - init_checkpoint : ./pretrained_checkpoints/gpt2-medium-pytorch_model.bin
31
+ - fp16 : False
32
+ - log_interval : 100
33
+ - eval_interval : 2000
34
+ - save_interval : 1000
35
+ - work_dir : ./trained_models/GPT2_M/e2e
36
+ - lora_dim : 4
37
+ - lora_alpha : 32
38
+ - obj : clm
39
+ - lora_dropout : 0.1
40
+ - label_smooth : 0.1
41
+ - roll_interval : -1
42
+ - roll_lr : 1e-05
43
+ - roll_step : 100
44
+ - eval_epoch : 1
45
+ - dist : <module 'torch.distributed' from '/home/inc/miniconda3/envs/fedadp-new/lib/python3.7/site-packages/torch/distributed/__init__.py'>
46
+ ====================================================================================================
47
+ Experiment dir : ./trained_models/GPT2_M/e2e
48
+ train_loader=5258, train_data=42064
49
+ valid_loader=1168, valid_data=4672
50
+ scaling = 8.0
51
+ loading model pretrained weight.
52
+ GPT2LMModel(
53
+ (transformer): GPT2Model(
54
+ (wte): Embedding(50257, 1024)
55
+ (wpe): Embedding(1024, 1024)
56
+ (h): ModuleList(
57
+ (0): Block(
58
+ (ln_1): LayerNorm()
59
+ (attn): Attention(
60
+ (c_attn): MergedLinear(
61
+ in_features=1024, out_features=3072, bias=True
62
+ (lora_dropout): Dropout(p=0.1, inplace=False)
63
+ )
64
+ (c_proj): Conv1D()
65
+ )
66
+ (ln_2): LayerNorm()
67
+ (mlp): MLP(
68
+ (c_fc): Conv1D()
69
+ (c_proj): Conv1D()
70
+ )
71
+ )
72
+ (1): Block(
73
+ (ln_1): LayerNorm()
74
+ (attn): Attention(
75
+ (c_attn): MergedLinear(
76
+ in_features=1024, out_features=3072, bias=True
77
+ (lora_dropout): Dropout(p=0.1, inplace=False)
78
+ )
79
+ (c_proj): Conv1D()
80
+ )
81
+ (ln_2): LayerNorm()
82
+ (mlp): MLP(
83
+ (c_fc): Conv1D()
84
+ (c_proj): Conv1D()
85
+ )
86
+ )
87
+ (2): Block(
88
+ (ln_1): LayerNorm()
89
+ (attn): Attention(
90
+ (c_attn): MergedLinear(
91
+ in_features=1024, out_features=3072, bias=True
92
+ (lora_dropout): Dropout(p=0.1, inplace=False)
93
+ )
94
+ (c_proj): Conv1D()
95
+ )
96
+ (ln_2): LayerNorm()
97
+ (mlp): MLP(
98
+ (c_fc): Conv1D()
99
+ (c_proj): Conv1D()
100
+ )
101
+ )
102
+ (3): Block(
103
+ (ln_1): LayerNorm()
104
+ (attn): Attention(
105
+ (c_attn): MergedLinear(
106
+ in_features=1024, out_features=3072, bias=True
107
+ (lora_dropout): Dropout(p=0.1, inplace=False)
108
+ )
109
+ (c_proj): Conv1D()
110
+ )
111
+ (ln_2): LayerNorm()
112
+ (mlp): MLP(
113
+ (c_fc): Conv1D()
114
+ (c_proj): Conv1D()
115
+ )
116
+ )
117
+ (4): Block(
118
+ (ln_1): LayerNorm()
119
+ (attn): Attention(
120
+ (c_attn): MergedLinear(
121
+ in_features=1024, out_features=3072, bias=True
122
+ (lora_dropout): Dropout(p=0.1, inplace=False)
123
+ )
124
+ (c_proj): Conv1D()
125
+ )
126
+ (ln_2): LayerNorm()
127
+ (mlp): MLP(
128
+ (c_fc): Conv1D()
129
+ (c_proj): Conv1D()
130
+ )
131
+ )
132
+ (5): Block(
133
+ (ln_1): LayerNorm()
134
+ (attn): Attention(
135
+ (c_attn): MergedLinear(
136
+ in_features=1024, out_features=3072, bias=True
137
+ (lora_dropout): Dropout(p=0.1, inplace=False)
138
+ )
139
+ (c_proj): Conv1D()
140
+ )
141
+ (ln_2): LayerNorm()
142
+ (mlp): MLP(
143
+ (c_fc): Conv1D()
144
+ (c_proj): Conv1D()
145
+ )
146
+ )
147
+ (6): Block(
148
+ (ln_1): LayerNorm()
149
+ (attn): Attention(
150
+ (c_attn): MergedLinear(
151
+ in_features=1024, out_features=3072, bias=True
152
+ (lora_dropout): Dropout(p=0.1, inplace=False)
153
+ )
154
+ (c_proj): Conv1D()
155
+ )
156
+ (ln_2): LayerNorm()
157
+ (mlp): MLP(
158
+ (c_fc): Conv1D()
159
+ (c_proj): Conv1D()
160
+ )
161
+ )
162
+ (7): Block(
163
+ (ln_1): LayerNorm()
164
+ (attn): Attention(
165
+ (c_attn): MergedLinear(
166
+ in_features=1024, out_features=3072, bias=True
167
+ (lora_dropout): Dropout(p=0.1, inplace=False)
168
+ )
169
+ (c_proj): Conv1D()
170
+ )
171
+ (ln_2): LayerNorm()
172
+ (mlp): MLP(
173
+ (c_fc): Conv1D()
174
+ (c_proj): Conv1D()
175
+ )
176
+ )
177
+ (8): Block(
178
+ (ln_1): LayerNorm()
179
+ (attn): Attention(
180
+ (c_attn): MergedLinear(
181
+ in_features=1024, out_features=3072, bias=True
182
+ (lora_dropout): Dropout(p=0.1, inplace=False)
183
+ )
184
+ (c_proj): Conv1D()
185
+ )
186
+ (ln_2): LayerNorm()
187
+ (mlp): MLP(
188
+ (c_fc): Conv1D()
189
+ (c_proj): Conv1D()
190
+ )
191
+ )
192
+ (9): Block(
193
+ (ln_1): LayerNorm()
194
+ (attn): Attention(
195
+ (c_attn): MergedLinear(
196
+ in_features=1024, out_features=3072, bias=True
197
+ (lora_dropout): Dropout(p=0.1, inplace=False)
198
+ )
199
+ (c_proj): Conv1D()
200
+ )
201
+ (ln_2): LayerNorm()
202
+ (mlp): MLP(
203
+ (c_fc): Conv1D()
204
+ (c_proj): Conv1D()
205
+ )
206
+ )
207
+ (10): Block(
208
+ (ln_1): LayerNorm()
209
+ (attn): Attention(
210
+ (c_attn): MergedLinear(
211
+ in_features=1024, out_features=3072, bias=True
212
+ (lora_dropout): Dropout(p=0.1, inplace=False)
213
+ )
214
+ (c_proj): Conv1D()
215
+ )
216
+ (ln_2): LayerNorm()
217
+ (mlp): MLP(
218
+ (c_fc): Conv1D()
219
+ (c_proj): Conv1D()
220
+ )
221
+ )
222
+ (11): Block(
223
+ (ln_1): LayerNorm()
224
+ (attn): Attention(
225
+ (c_attn): MergedLinear(
226
+ in_features=1024, out_features=3072, bias=True
227
+ (lora_dropout): Dropout(p=0.1, inplace=False)
228
+ )
229
+ (c_proj): Conv1D()
230
+ )
231
+ (ln_2): LayerNorm()
232
+ (mlp): MLP(
233
+ (c_fc): Conv1D()
234
+ (c_proj): Conv1D()
235
+ )
236
+ )
237
+ (12): Block(
238
+ (ln_1): LayerNorm()
239
+ (attn): Attention(
240
+ (c_attn): MergedLinear(
241
+ in_features=1024, out_features=3072, bias=True
242
+ (lora_dropout): Dropout(p=0.1, inplace=False)
243
+ )
244
+ (c_proj): Conv1D()
245
+ )
246
+ (ln_2): LayerNorm()
247
+ (mlp): MLP(
248
+ (c_fc): Conv1D()
249
+ (c_proj): Conv1D()
250
+ )
251
+ )
252
+ (13): Block(
253
+ (ln_1): LayerNorm()
254
+ (attn): Attention(
255
+ (c_attn): MergedLinear(
256
+ in_features=1024, out_features=3072, bias=True
257
+ (lora_dropout): Dropout(p=0.1, inplace=False)
258
+ )
259
+ (c_proj): Conv1D()
260
+ )
261
+ (ln_2): LayerNorm()
262
+ (mlp): MLP(
263
+ (c_fc): Conv1D()
264
+ (c_proj): Conv1D()
265
+ )
266
+ )
267
+ (14): Block(
268
+ (ln_1): LayerNorm()
269
+ (attn): Attention(
270
+ (c_attn): MergedLinear(
271
+ in_features=1024, out_features=3072, bias=True
272
+ (lora_dropout): Dropout(p=0.1, inplace=False)
273
+ )
274
+ (c_proj): Conv1D()
275
+ )
276
+ (ln_2): LayerNorm()
277
+ (mlp): MLP(
278
+ (c_fc): Conv1D()
279
+ (c_proj): Conv1D()
280
+ )
281
+ )
282
+ (15): Block(
283
+ (ln_1): LayerNorm()
284
+ (attn): Attention(
285
+ (c_attn): MergedLinear(
286
+ in_features=1024, out_features=3072, bias=True
287
+ (lora_dropout): Dropout(p=0.1, inplace=False)
288
+ )
289
+ (c_proj): Conv1D()
290
+ )
291
+ (ln_2): LayerNorm()
292
+ (mlp): MLP(
293
+ (c_fc): Conv1D()
294
+ (c_proj): Conv1D()
295
+ )
296
+ )
297
+ (16): Block(
298
+ (ln_1): LayerNorm()
299
+ (attn): Attention(
300
+ (c_attn): MergedLinear(
301
+ in_features=1024, out_features=3072, bias=True
302
+ (lora_dropout): Dropout(p=0.1, inplace=False)
303
+ )
304
+ (c_proj): Conv1D()
305
+ )
306
+ (ln_2): LayerNorm()
307
+ (mlp): MLP(
308
+ (c_fc): Conv1D()
309
+ (c_proj): Conv1D()
310
+ )
311
+ )
312
+ (17): Block(
313
+ (ln_1): LayerNorm()
314
+ (attn): Attention(
315
+ (c_attn): MergedLinear(
316
+ in_features=1024, out_features=3072, bias=True
317
+ (lora_dropout): Dropout(p=0.1, inplace=False)
318
+ )
319
+ (c_proj): Conv1D()
320
+ )
321
+ (ln_2): LayerNorm()
322
+ (mlp): MLP(
323
+ (c_fc): Conv1D()
324
+ (c_proj): Conv1D()
325
+ )
326
+ )
327
+ (18): Block(
328
+ (ln_1): LayerNorm()
329
+ (attn): Attention(
330
+ (c_attn): MergedLinear(
331
+ in_features=1024, out_features=3072, bias=True
332
+ (lora_dropout): Dropout(p=0.1, inplace=False)
333
+ )
334
+ (c_proj): Conv1D()
335
+ )
336
+ (ln_2): LayerNorm()
337
+ (mlp): MLP(
338
+ (c_fc): Conv1D()
339
+ (c_proj): Conv1D()
340
+ )
341
+ )
342
+ (19): Block(
343
+ (ln_1): LayerNorm()
344
+ (attn): Attention(
345
+ (c_attn): MergedLinear(
346
+ in_features=1024, out_features=3072, bias=True
347
+ (lora_dropout): Dropout(p=0.1, inplace=False)
348
+ )
349
+ (c_proj): Conv1D()
350
+ )
351
+ (ln_2): LayerNorm()
352
+ (mlp): MLP(
353
+ (c_fc): Conv1D()
354
+ (c_proj): Conv1D()
355
+ )
356
+ )
357
+ (20): Block(
358
+ (ln_1): LayerNorm()
359
+ (attn): Attention(
360
+ (c_attn): MergedLinear(
361
+ in_features=1024, out_features=3072, bias=True
362
+ (lora_dropout): Dropout(p=0.1, inplace=False)
363
+ )
364
+ (c_proj): Conv1D()
365
+ )
366
+ (ln_2): LayerNorm()
367
+ (mlp): MLP(
368
+ (c_fc): Conv1D()
369
+ (c_proj): Conv1D()
370
+ )
371
+ )
372
+ (21): Block(
373
+ (ln_1): LayerNorm()
374
+ (attn): Attention(
375
+ (c_attn): MergedLinear(
376
+ in_features=1024, out_features=3072, bias=True
377
+ (lora_dropout): Dropout(p=0.1, inplace=False)
378
+ )
379
+ (c_proj): Conv1D()
380
+ )
381
+ (ln_2): LayerNorm()
382
+ (mlp): MLP(
383
+ (c_fc): Conv1D()
384
+ (c_proj): Conv1D()
385
+ )
386
+ )
387
+ (22): Block(
388
+ (ln_1): LayerNorm()
389
+ (attn): Attention(
390
+ (c_attn): MergedLinear(
391
+ in_features=1024, out_features=3072, bias=True
392
+ (lora_dropout): Dropout(p=0.1, inplace=False)
393
+ )
394
+ (c_proj): Conv1D()
395
+ )
396
+ (ln_2): LayerNorm()
397
+ (mlp): MLP(
398
+ (c_fc): Conv1D()
399
+ (c_proj): Conv1D()
400
+ )
401
+ )
402
+ (23): Block(
403
+ (ln_1): LayerNorm()
404
+ (attn): Attention(
405
+ (c_attn): MergedLinear(
406
+ in_features=1024, out_features=3072, bias=True
407
+ (lora_dropout): Dropout(p=0.1, inplace=False)
408
+ )
409
+ (c_proj): Conv1D()
410
+ )
411
+ (ln_2): LayerNorm()
412
+ (mlp): MLP(
413
+ (c_fc): Conv1D()
414
+ (c_proj): Conv1D()
415
+ )
416
+ )
417
+ )
418
+ (ln_f): LayerNorm()
419
+ )
420
+ (lm_head): GPT2LMHead(
421
+ (decoder): Linear(in_features=1024, out_features=50257, bias=False)
422
+ )
423
+ )
424
+ vocab_size: 50257
425
+ n_ctx: 1024
426
+ n_positions: 1024
427
+ n_embd: 1024
428
+ n_layer: 24
429
+ n_head: 16
430
+ layer_norm_epsilon: 1e-05
431
+ initializer_range: 0.02
432
+ lora_attn_dim: 4
433
+ lora_attn_alpha: 32
434
+ lora_dropout: 0.1
435
+ lora_r_dropout: 0.0
436
+ fix_dropout: 0.0
437
+ Namespace(adam_beta1=0.9, adam_beta2=0.999, adam_epislon=1e-06, clip=0.0, correct_bias=True, device=device(type='cuda', index=4), dist=<module 'torch.distributed' from '/home/inc/miniconda3/envs/fedadp-new/lib/python3.7/site-packages/torch/distributed/__init__.py'>, eval_epoch=1, eval_interval=2000, fp16=False, grad_acc=1, i_lrs='0.00025', i_steps='0', init_checkpoint='./pretrained_checkpoints/gpt2-medium-pytorch_model.bin', label_smooth=0.1, local_rank=0, log_interval=100, logging=functools.partial(<function logging at 0x7f90cac2ae60>, log_path='./trained_models/GPT2_M/e2e/log.txt'), lora_alpha=32, lora_dim=4, lora_dropout=0.1, lr=0.0002, max_epoch=5, max_step=None, model_card='gpt2.md', no_decay_bias=False, obj='clm', platform='local', random_seed=110, rank=0, roll_interval=-1, roll_lr=1e-05, roll_step=100, save_interval=1000, scheduler='linear', seq_len=512, train_batch_size=8, train_data='./data/e2e/train.jsonl', valid_batch_size=4, valid_data='./data/e2e/valid.jsonl', warmup_step=500, weight_decay=0.01, work_dir='./trained_models/GPT2_M/e2e', world_size=1)
438
+ optimizer: AdamW (
439
+ Parameter Group 0
440
+ betas: (0.9, 0.999)
441
+ correct_bias: True
442
+ eps: 1e-06
443
+ lr: 0.0002
444
+ weight_decay: 0.01
445
+ )
446
+ set max_step: 26290
447
+ train_data.num_batches: 5258
448
+ start to train the model................ 1
449
+ /home/inc/Documents/fzh/python/LoRA-main/examples/NLG/src/optimizer.py:117: UserWarning: This overload of addcdiv_ is deprecated:
450
+ addcdiv_(Number value, Tensor tensor1, Tensor tensor2)
451
+ Consider using one of the following signatures instead:
452
+ addcdiv_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1050.)
453
+ p.data.addcdiv_(-step_size, exp_avg, denom)
454
+
455
+
456
+ | epoch 1 step 100 | 100 batches | lr 4e-05 | ms/batch 612.69 | loss 5.06 | avg loss 5.52 | ppl 250.72
457
+ | epoch 1 step 200 | 200 batches | lr 8e-05 | ms/batch 608.52 | loss 3.21 | avg loss 3.70 | ppl 40.58
458
+ | epoch 1 step 300 | 300 batches | lr 0.00012 | ms/batch 609.77 | loss 2.98 | avg loss 3.08 | ppl 21.74
459
+ | epoch 1 step 400 | 400 batches | lr 0.00016 | ms/batch 610.18 | loss 3.11 | avg loss 2.98 | ppl 19.63
460
+ | epoch 1 step 500 | 500 batches | lr 0.0002 | ms/batch 610.03 | loss 2.84 | avg loss 2.89 | ppl 18.03
461
+ | epoch 1 step 600 | 600 batches | lr 0.000199 | ms/batch 608.84 | loss 2.77 | avg loss 2.83 | ppl 16.93
462
+ | epoch 1 step 700 | 700 batches | lr 0.000198 | ms/batch 611.37 | loss 2.88 | avg loss 2.80 | ppl 16.37
463
+ | epoch 1 step 800 | 800 batches | lr 0.000198 | ms/batch 611.10 | loss 2.48 | avg loss 2.76 | ppl 15.76
464
+ | epoch 1 step 900 | 900 batches | lr 0.000197 | ms/batch 610.61 | loss 2.50 | avg loss 2.75 | ppl 15.59
465
+ | epoch 1 step 1000 | 1000 batches | lr 0.000196 | ms/batch 610.44 | loss 3.19 | avg loss 2.77 | ppl 15.95
466
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.1000.pt
467
+ | epoch 1 step 1100 | 1100 batches | lr 0.000195 | ms/batch 612.14 | loss 2.76 | avg loss 2.73 | ppl 15.41
468
+ | epoch 1 step 1200 | 1200 batches | lr 0.000195 | ms/batch 608.16 | loss 3.02 | avg loss 2.76 | ppl 15.84
469
+ | epoch 1 step 1300 | 1300 batches | lr 0.000194 | ms/batch 610.06 | loss 2.55 | avg loss 2.75 | ppl 15.62
470
+ | epoch 1 step 1400 | 1400 batches | lr 0.000193 | ms/batch 609.24 | loss 2.35 | avg loss 2.70 | ppl 14.93
471
+ | epoch 1 step 1500 | 1500 batches | lr 0.000192 | ms/batch 607.91 | loss 2.53 | avg loss 2.72 | ppl 15.24
472
+ | epoch 1 step 1600 | 1600 batches | lr 0.000191 | ms/batch 608.62 | loss 2.53 | avg loss 2.67 | ppl 14.50
473
+ | epoch 1 step 1700 | 1700 batches | lr 0.000191 | ms/batch 608.92 | loss 2.66 | avg loss 2.71 | ppl 14.99
474
+ | epoch 1 step 1800 | 1800 batches | lr 0.00019 | ms/batch 608.44 | loss 2.55 | avg loss 2.69 | ppl 14.75
475
+ | epoch 1 step 1900 | 1900 batches | lr 0.000189 | ms/batch 609.27 | loss 2.43 | avg loss 2.66 | ppl 14.31
476
+ | epoch 1 step 2000 | 2000 batches | lr 0.000188 | ms/batch 607.05 | loss 2.71 | avg loss 2.66 | ppl 14.36
477
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.2000.pt
478
+ /home/inc/miniconda3/envs/fedadp-new/lib/python3.7/site-packages/torch/nn/_reduction.py:42: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead.
479
+ warnings.warn(warning.format(ret))
480
+ eval samples: 0 loss: tensor(1.1374, device='cuda:4')
481
+ eval samples: 100 loss: tensor(1.0985, device='cuda:4')
482
+ eval samples: 200 loss: tensor(1.2215, device='cuda:4')
483
+ eval samples: 300 loss: tensor(1.2918, device='cuda:4')
484
+ eval samples: 400 loss: tensor(1.6716, device='cuda:4')
485
+ eval samples: 500 loss: tensor(1.9854, device='cuda:4')
486
+ eval samples: 600 loss: tensor(1.2216, device='cuda:4')
487
+ eval samples: 700 loss: tensor(1.0347, device='cuda:4')
488
+ eval samples: 800 loss: tensor(1.5289, device='cuda:4')
489
+ eval samples: 900 loss: tensor(1.5743, device='cuda:4')
490
+ eval samples: 1000 loss: tensor(1.3339, device='cuda:4')
491
+ eval samples: 1100 loss: tensor(1.3198, device='cuda:4')
492
+ average loss 1.3344345796496084
493
+ ----------------------------------------------------------------------------------------------------
494
+ | Eval 1 at step 2000 | time: 137.89s | valid loss 1.33 | valid ppl 3.80 | best ppl 3.80
495
+ ----------------------------------------------------------------------------------------------------
496
+ | epoch 1 step 2100 | 2100 batches | lr 0.000188 | ms/batch 1988.14 | loss 2.64 | avg loss 2.68 | ppl 14.57
497
+ | epoch 1 step 2200 | 2200 batches | lr 0.000187 | ms/batch 608.77 | loss 2.45 | avg loss 2.66 | ppl 14.34
498
+ | epoch 1 step 2300 | 2300 batches | lr 0.000186 | ms/batch 610.52 | loss 2.60 | avg loss 2.67 | ppl 14.38
499
+ | epoch 1 step 2400 | 2400 batches | lr 0.000185 | ms/batch 608.14 | loss 2.70 | avg loss 2.67 | ppl 14.49
500
+ | epoch 1 step 2500 | 2500 batches | lr 0.000184 | ms/batch 607.87 | loss 2.52 | avg loss 2.64 | ppl 14.05
501
+ | epoch 1 step 2600 | 2600 batches | lr 0.000184 | ms/batch 608.44 | loss 2.54 | avg loss 2.70 | ppl 14.85
502
+ | epoch 1 step 2700 | 2700 batches | lr 0.000183 | ms/batch 608.49 | loss 2.87 | avg loss 2.69 | ppl 14.72
503
+ | epoch 1 step 2800 | 2800 batches | lr 0.000182 | ms/batch 608.82 | loss 2.44 | avg loss 2.66 | ppl 14.26
504
+ | epoch 1 step 2900 | 2900 batches | lr 0.000181 | ms/batch 609.19 | loss 2.69 | avg loss 2.68 | ppl 14.52
505
+ | epoch 1 step 3000 | 3000 batches | lr 0.000181 | ms/batch 609.05 | loss 2.73 | avg loss 2.64 | ppl 13.99
506
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.3000.pt
507
+ | epoch 1 step 3100 | 3100 batches | lr 0.00018 | ms/batch 609.17 | loss 2.63 | avg loss 2.64 | ppl 14.04
508
+ | epoch 1 step 3200 | 3200 batches | lr 0.000179 | ms/batch 609.50 | loss 2.57 | avg loss 2.66 | ppl 14.28
509
+ | epoch 1 step 3300 | 3300 batches | lr 0.000178 | ms/batch 607.31 | loss 2.47 | avg loss 2.62 | ppl 13.76
510
+ | epoch 1 step 3400 | 3400 batches | lr 0.000178 | ms/batch 604.83 | loss 2.54 | avg loss 2.60 | ppl 13.49
511
+ | epoch 1 step 3500 | 3500 batches | lr 0.000177 | ms/batch 607.92 | loss 2.62 | avg loss 2.63 | ppl 13.90
512
+ | epoch 1 step 3600 | 3600 batches | lr 0.000176 | ms/batch 608.49 | loss 2.41 | avg loss 2.62 | ppl 13.78
513
+ | epoch 1 step 3700 | 3700 batches | lr 0.000175 | ms/batch 605.91 | loss 2.58 | avg loss 2.59 | ppl 13.36
514
+ | epoch 1 step 3800 | 3800 batches | lr 0.000174 | ms/batch 607.54 | loss 2.46 | avg loss 2.64 | ppl 13.97
515
+ | epoch 1 step 3900 | 3900 batches | lr 0.000174 | ms/batch 610.01 | loss 2.68 | avg loss 2.66 | ppl 14.24
516
+ | epoch 1 step 4000 | 4000 batches | lr 0.000173 | ms/batch 607.98 | loss 2.78 | avg loss 2.64 | ppl 14.04
517
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.4000.pt
518
+ eval samples: 0 loss: tensor(1.1133, device='cuda:4')
519
+ eval samples: 100 loss: tensor(1.0210, device='cuda:4')
520
+ eval samples: 200 loss: tensor(1.1742, device='cuda:4')
521
+ eval samples: 300 loss: tensor(1.2072, device='cuda:4')
522
+ eval samples: 400 loss: tensor(1.6256, device='cuda:4')
523
+ eval samples: 500 loss: tensor(1.9378, device='cuda:4')
524
+ eval samples: 600 loss: tensor(1.0971, device='cuda:4')
525
+ eval samples: 700 loss: tensor(1.0210, device='cuda:4')
526
+ eval samples: 800 loss: tensor(1.4538, device='cuda:4')
527
+ eval samples: 900 loss: tensor(1.5298, device='cuda:4')
528
+ eval samples: 1000 loss: tensor(1.2354, device='cuda:4')
529
+ eval samples: 1100 loss: tensor(1.2567, device='cuda:4')
530
+ average loss 1.2714025441506138
531
+ ----------------------------------------------------------------------------------------------------
532
+ | Eval 2 at step 4000 | time: 138.19s | valid loss 1.27 | valid ppl 3.57 | best ppl 3.57
533
+ ----------------------------------------------------------------------------------------------------
534
+ | epoch 1 step 4100 | 4100 batches | lr 0.000172 | ms/batch 1990.32 | loss 2.81 | avg loss 2.62 | ppl 13.78
535
+ | epoch 1 step 4200 | 4200 batches | lr 0.000171 | ms/batch 608.76 | loss 3.11 | avg loss 2.61 | ppl 13.57
536
+ | epoch 1 step 4300 | 4300 batches | lr 0.000171 | ms/batch 610.45 | loss 2.46 | avg loss 2.61 | ppl 13.63
537
+ | epoch 1 step 4400 | 4400 batches | lr 0.00017 | ms/batch 610.84 | loss 2.96 | avg loss 2.62 | ppl 13.74
538
+ | epoch 1 step 4500 | 4500 batches | lr 0.000169 | ms/batch 611.36 | loss 2.78 | avg loss 2.61 | ppl 13.58
539
+ | epoch 1 step 4600 | 4600 batches | lr 0.000168 | ms/batch 612.08 | loss 2.81 | avg loss 2.57 | ppl 13.07
540
+ | epoch 1 step 4700 | 4700 batches | lr 0.000167 | ms/batch 615.36 | loss 2.90 | avg loss 2.63 | ppl 13.91
541
+ | epoch 1 step 4800 | 4800 batches | lr 0.000167 | ms/batch 611.17 | loss 2.99 | avg loss 2.61 | ppl 13.55
542
+ | epoch 1 step 4900 | 4900 batches | lr 0.000166 | ms/batch 608.81 | loss 2.73 | avg loss 2.60 | ppl 13.47
543
+ | epoch 1 step 5000 | 5000 batches | lr 0.000165 | ms/batch 609.73 | loss 2.50 | avg loss 2.58 | ppl 13.26
544
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.5000.pt
545
+ | epoch 1 step 5100 | 5100 batches | lr 0.000164 | ms/batch 609.36 | loss 2.27 | avg loss 2.59 | ppl 13.33
546
+ | epoch 1 step 5200 | 5200 batches | lr 0.000164 | ms/batch 611.66 | loss 2.39 | avg loss 2.62 | ppl 13.78
547
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.5258.pt
548
+ start to train the model................ 2
549
+ | epoch 2 step 5300 | 42 batches | lr 0.000163 | ms/batch 256.06 | loss 2.41 | avg loss 2.61 | ppl 13.53
550
+ | epoch 2 step 5400 | 142 batches | lr 0.000162 | ms/batch 609.01 | loss 2.63 | avg loss 2.61 | ppl 13.58
551
+ | epoch 2 step 5500 | 242 batches | lr 0.000161 | ms/batch 612.10 | loss 2.45 | avg loss 2.59 | ppl 13.30
552
+ | epoch 2 step 5600 | 342 batches | lr 0.00016 | ms/batch 611.07 | loss 2.67 | avg loss 2.59 | ppl 13.27
553
+ | epoch 2 step 5700 | 442 batches | lr 0.00016 | ms/batch 611.19 | loss 2.52 | avg loss 2.64 | ppl 13.95
554
+ | epoch 2 step 5800 | 542 batches | lr 0.000159 | ms/batch 611.61 | loss 2.87 | avg loss 2.57 | ppl 13.10
555
+ | epoch 2 step 5900 | 642 batches | lr 0.000158 | ms/batch 612.67 | loss 3.17 | avg loss 2.58 | ppl 13.25
556
+ | epoch 2 step 6000 | 742 batches | lr 0.000157 | ms/batch 610.88 | loss 2.45 | avg loss 2.59 | ppl 13.32
557
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.6000.pt
558
+ eval samples: 0 loss: tensor(1.0454, device='cuda:4')
559
+ eval samples: 100 loss: tensor(0.9909, device='cuda:4')
560
+ eval samples: 200 loss: tensor(1.1352, device='cuda:4')
561
+ eval samples: 300 loss: tensor(1.1335, device='cuda:4')
562
+ eval samples: 400 loss: tensor(1.5766, device='cuda:4')
563
+ eval samples: 500 loss: tensor(2.0034, device='cuda:4')
564
+ eval samples: 600 loss: tensor(1.1043, device='cuda:4')
565
+ eval samples: 700 loss: tensor(0.9965, device='cuda:4')
566
+ eval samples: 800 loss: tensor(1.4912, device='cuda:4')
567
+ eval samples: 900 loss: tensor(1.5128, device='cuda:4')
568
+ eval samples: 1000 loss: tensor(1.1385, device='cuda:4')
569
+ eval samples: 1100 loss: tensor(1.2201, device='cuda:4')
570
+ average loss 1.239899498908079
571
+ ----------------------------------------------------------------------------------------------------
572
+ | Eval 3 at step 6000 | time: 138.83s | valid loss 1.24 | valid ppl 3.46 | best ppl 3.46
573
+ ----------------------------------------------------------------------------------------------------
574
+ | epoch 2 step 6100 | 842 batches | lr 0.000157 | ms/batch 1999.78 | loss 2.55 | avg loss 2.61 | ppl 13.54
575
+ | epoch 2 step 6200 | 942 batches | lr 0.000156 | ms/batch 612.01 | loss 2.72 | avg loss 2.60 | ppl 13.48
576
+ | epoch 2 step 6300 | 1042 batches | lr 0.000155 | ms/batch 611.75 | loss 2.61 | avg loss 2.58 | ppl 13.26
577
+ | epoch 2 step 6400 | 1142 batches | lr 0.000154 | ms/batch 612.29 | loss 2.48 | avg loss 2.58 | ppl 13.15
578
+ | epoch 2 step 6500 | 1242 batches | lr 0.000153 | ms/batch 613.03 | loss 2.90 | avg loss 2.62 | ppl 13.67
579
+ | epoch 2 step 6600 | 1342 batches | lr 0.000153 | ms/batch 611.04 | loss 3.07 | avg loss 2.58 | ppl 13.16
580
+ | epoch 2 step 6700 | 1442 batches | lr 0.000152 | ms/batch 611.17 | loss 2.79 | avg loss 2.56 | ppl 12.96
581
+ | epoch 2 step 6800 | 1542 batches | lr 0.000151 | ms/batch 614.47 | loss 2.50 | avg loss 2.56 | ppl 12.95
582
+ | epoch 2 step 6900 | 1642 batches | lr 0.00015 | ms/batch 610.47 | loss 2.71 | avg loss 2.56 | ppl 12.99
583
+ | epoch 2 step 7000 | 1742 batches | lr 0.00015 | ms/batch 608.59 | loss 2.56 | avg loss 2.59 | ppl 13.37
584
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.7000.pt
585
+ | epoch 2 step 7100 | 1842 batches | lr 0.000149 | ms/batch 610.96 | loss 2.32 | avg loss 2.57 | ppl 13.01
586
+ | epoch 2 step 7200 | 1942 batches | lr 0.000148 | ms/batch 610.97 | loss 2.41 | avg loss 2.53 | ppl 12.50
587
+ | epoch 2 step 7300 | 2042 batches | lr 0.000147 | ms/batch 611.57 | loss 2.48 | avg loss 2.57 | ppl 13.10
588
+ | epoch 2 step 7400 | 2142 batches | lr 0.000146 | ms/batch 610.40 | loss 2.39 | avg loss 2.56 | ppl 12.89
589
+ | epoch 2 step 7500 | 2242 batches | lr 0.000146 | ms/batch 610.66 | loss 2.63 | avg loss 2.57 | ppl 13.04
590
+ | epoch 2 step 7600 | 2342 batches | lr 0.000145 | ms/batch 610.52 | loss 2.63 | avg loss 2.58 | ppl 13.26
591
+ | epoch 2 step 7700 | 2442 batches | lr 0.000144 | ms/batch 608.69 | loss 2.22 | avg loss 2.54 | ppl 12.73
592
+ | epoch 2 step 7800 | 2542 batches | lr 0.000143 | ms/batch 609.99 | loss 2.35 | avg loss 2.57 | ppl 13.07
593
+ | epoch 2 step 7900 | 2642 batches | lr 0.000143 | ms/batch 609.05 | loss 2.72 | avg loss 2.60 | ppl 13.47
594
+ | epoch 2 step 8000 | 2742 batches | lr 0.000142 | ms/batch 609.02 | loss 2.57 | avg loss 2.59 | ppl 13.30
595
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.8000.pt
596
+ eval samples: 0 loss: tensor(1.0535, device='cuda:4')
597
+ eval samples: 100 loss: tensor(0.9691, device='cuda:4')
598
+ eval samples: 200 loss: tensor(1.1137, device='cuda:4')
599
+ eval samples: 300 loss: tensor(1.1214, device='cuda:4')
600
+ eval samples: 400 loss: tensor(1.5688, device='cuda:4')
601
+ eval samples: 500 loss: tensor(1.9425, device='cuda:4')
602
+ eval samples: 600 loss: tensor(1.0476, device='cuda:4')
603
+ eval samples: 700 loss: tensor(0.9898, device='cuda:4')
604
+ eval samples: 800 loss: tensor(1.4776, device='cuda:4')
605
+ eval samples: 900 loss: tensor(1.5046, device='cuda:4')
606
+ eval samples: 1000 loss: tensor(1.1689, device='cuda:4')
607
+ eval samples: 1100 loss: tensor(1.1641, device='cuda:4')
608
+ average loss 1.2270236368456933
609
+ ----------------------------------------------------------------------------------------------------
610
+ | Eval 4 at step 8000 | time: 138.04s | valid loss 1.23 | valid ppl 3.41 | best ppl 3.41
611
+ ----------------------------------------------------------------------------------------------------
612
+ | epoch 2 step 8100 | 2842 batches | lr 0.000141 | ms/batch 1991.53 | loss 2.46 | avg loss 2.56 | ppl 12.98
613
+ | epoch 2 step 8200 | 2942 batches | lr 0.00014 | ms/batch 609.84 | loss 2.50 | avg loss 2.60 | ppl 13.49
614
+ | epoch 2 step 8300 | 3042 batches | lr 0.00014 | ms/batch 610.87 | loss 2.47 | avg loss 2.54 | ppl 12.72
615
+ | epoch 2 step 8400 | 3142 batches | lr 0.000139 | ms/batch 610.92 | loss 2.41 | avg loss 2.57 | ppl 13.03
616
+ | epoch 2 step 8500 | 3242 batches | lr 0.000138 | ms/batch 611.04 | loss 2.81 | avg loss 2.56 | ppl 12.89
617
+ | epoch 2 step 8600 | 3342 batches | lr 0.000137 | ms/batch 612.82 | loss 2.40 | avg loss 2.55 | ppl 12.87
618
+ | epoch 2 step 8700 | 3442 batches | lr 0.000136 | ms/batch 611.25 | loss 2.47 | avg loss 2.52 | ppl 12.43
619
+ | epoch 2 step 8800 | 3542 batches | lr 0.000136 | ms/batch 611.59 | loss 2.57 | avg loss 2.55 | ppl 12.86
620
+ | epoch 2 step 8900 | 3642 batches | lr 0.000135 | ms/batch 611.43 | loss 2.33 | avg loss 2.54 | ppl 12.62
621
+ | epoch 2 step 9000 | 3742 batches | lr 0.000134 | ms/batch 610.78 | loss 2.96 | avg loss 2.55 | ppl 12.78
622
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.9000.pt
623
+ | epoch 2 step 9100 | 3842 batches | lr 0.000133 | ms/batch 608.39 | loss 2.67 | avg loss 2.55 | ppl 12.81
624
+ | epoch 2 step 9200 | 3942 batches | lr 0.000133 | ms/batch 611.72 | loss 2.65 | avg loss 2.58 | ppl 13.17
625
+ | epoch 2 step 9300 | 4042 batches | lr 0.000132 | ms/batch 611.24 | loss 2.60 | avg loss 2.58 | ppl 13.15
626
+ | epoch 2 step 9400 | 4142 batches | lr 0.000131 | ms/batch 613.45 | loss 2.58 | avg loss 2.56 | ppl 12.95
627
+ | epoch 2 step 9500 | 4242 batches | lr 0.00013 | ms/batch 611.51 | loss 2.40 | avg loss 2.54 | ppl 12.71
628
+ | epoch 2 step 9600 | 4342 batches | lr 0.000129 | ms/batch 613.03 | loss 2.62 | avg loss 2.53 | ppl 12.55
629
+ | epoch 2 step 9700 | 4442 batches | lr 0.000129 | ms/batch 612.45 | loss 2.26 | avg loss 2.54 | ppl 12.74
630
+ | epoch 2 step 9800 | 4542 batches | lr 0.000128 | ms/batch 610.95 | loss 2.78 | avg loss 2.55 | ppl 12.82
631
+ | epoch 2 step 9900 | 4642 batches | lr 0.000127 | ms/batch 608.32 | loss 2.61 | avg loss 2.52 | ppl 12.37
632
+ | epoch 2 step 10000 | 4742 batches | lr 0.000126 | ms/batch 610.72 | loss 2.45 | avg loss 2.54 | ppl 12.73
633
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.10000.pt
634
+ eval samples: 0 loss: tensor(1.0123, device='cuda:4')
635
+ eval samples: 100 loss: tensor(1.0022, device='cuda:4')
636
+ eval samples: 200 loss: tensor(1.0972, device='cuda:4')
637
+ eval samples: 300 loss: tensor(1.1317, device='cuda:4')
638
+ eval samples: 400 loss: tensor(1.5788, device='cuda:4')
639
+ eval samples: 500 loss: tensor(1.9430, device='cuda:4')
640
+ eval samples: 600 loss: tensor(1.0426, device='cuda:4')
641
+ eval samples: 700 loss: tensor(0.9720, device='cuda:4')
642
+ eval samples: 800 loss: tensor(1.4556, device='cuda:4')
643
+ eval samples: 900 loss: tensor(1.4790, device='cuda:4')
644
+ eval samples: 1000 loss: tensor(1.1323, device='cuda:4')
645
+ eval samples: 1100 loss: tensor(1.1691, device='cuda:4')
646
+ average loss 1.2222425683006033
647
+ ----------------------------------------------------------------------------------------------------
648
+ | Eval 5 at step 10000 | time: 139.05s | valid loss 1.22 | valid ppl 3.39 | best ppl 3.39
649
+ ----------------------------------------------------------------------------------------------------
650
+ | epoch 2 step 10100 | 4842 batches | lr 0.000126 | ms/batch 2003.85 | loss 2.46 | avg loss 2.55 | ppl 12.79
651
+ | epoch 2 step 10200 | 4942 batches | lr 0.000125 | ms/batch 609.56 | loss 2.62 | avg loss 2.56 | ppl 12.88
652
+ | epoch 2 step 10300 | 5042 batches | lr 0.000124 | ms/batch 610.36 | loss 2.85 | avg loss 2.51 | ppl 12.28
653
+ | epoch 2 step 10400 | 5142 batches | lr 0.000123 | ms/batch 610.63 | loss 2.40 | avg loss 2.57 | ppl 13.05
654
+ | epoch 2 step 10500 | 5242 batches | lr 0.000122 | ms/batch 613.64 | loss 2.43 | avg loss 2.52 | ppl 12.45
655
+ saving checkpoint ./trained_models/GPT2_M/e2e/model.10516.pt
656
+ start to train the model................ 3
657
+ | epoch 3 step 10600 | 84 batches | lr 0.000122 | ms/batch 510.61 | loss 2.63 | avg loss 2.53 | ppl 12.61
658
+ | epoch 3 step 10700 | 184 batches | lr 0.000121 | ms/batch 613.48 | loss 2.67 | avg loss 2.56 | ppl 13.00
659
+ | epoch 3 step 10800 | 284 batches | lr 0.00012 | ms/batch 608.43 | loss 2.48 | avg loss 2.52 | ppl 12.39
660
+ | epoch 3 step 10900 | 384 batches | lr 0.000119 | ms/batch 611.59 | loss 2.69 | avg loss 2.56 | ppl 12.91
661
+
662
+
663
+
664
+
665
+
666
+ Running MS-COCO evaluator...
667
+ creating index...
668
+ index created!
669
+ Loading and preparing results...
670
+ DONE (t=0.00s)
671
+ creating index...
672
+ index created!
673
+ tokenization...
674
+ PTBTokenizer tokenized 22530 tokens at 184928.37 tokens per second.
675
+ PTBTokenizer tokenized 2122 tokens at 21442.98 tokens per second.
676
+ setting up scorers...
677
+ computing METEOR score...
678
+ METEOR: 0.485
679
+ computing Rouge score...
680
+ ROUGE_L: 0.761
681
+ computing CIDEr score...
682
+ CIDEr: 3.314
683
+ Running Py-MTEval metrics...
684
+ SCORES:
685
+ ==============
686
+ BLEU: 0.7401
687
+ NIST: 8.6766
688
+ METEOR: 0.4851
689
+ ROUGE_L: 0.7614
690
+ CIDEr: 3.3144
691
+ === lora.Linear, model.5258.pt ===
692
+
693
+ BLEU: 0.7905
694
+ NIST: 9.1684
695
+ METEOR: 0.5016
696
+ ROUGE_L: 0.7865
697
+ CIDEr: 3.4686
698
+ === lora.MergedLinear, model.26290.pt ===
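
Note: the "valid ppl" column in the log above is simply the exponential of the averaged validation loss computed in evaluate() (gpt2_ft.py). A one-line check against the Eval 5 entry:

import math

avg_valid_loss = 1.2222425683006033       # "average loss" printed at Eval 5
print(f"{math.exp(avg_valid_loss):.2f}")  # 3.39, matching "valid ppl 3.39"
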
examples/NLG/src/model.py ADDED
@@ -0,0 +1,460 @@
1
+ # ------------------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4
+ # ------------------------------------------------------------------------------------------
5
+ import logging
6
+ import math
7
+ import os
8
+ from collections import OrderedDict
9
+ import copy
10
+ import math
11
+
12
+ import torch
13
+ from torch import nn
14
+ from torch.nn import CrossEntropyLoss, MSELoss
15
+ import torch.nn.functional as F
16
+ from torch.optim import Optimizer
17
+ from torch.optim.lr_scheduler import LambdaLR
18
+ from torch.nn.parameter import Parameter
19
+
20
+ import loralib as lora
21
+
22
+
23
+ def gelu(x):
24
+ return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
25
+
26
+
27
+ def gelu_fast(x):
28
+ return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
29
+
30
+
31
+ def gelu_new(x):
32
+ """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
33
+ Also see https://arxiv.org/abs/1606.08415
34
+ """
35
+ return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
36
+
37
+
38
+ def swish(x):
39
+ return x * torch.sigmoid(x)
40
+
41
+
42
+ def _gelu_python(x):
43
+ """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
44
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
45
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
46
+ This is now written in C in torch.nn.functional
47
+ Also see https://arxiv.org/abs/1606.08415
48
+ """
49
+ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
50
+
51
+
52
+ class LayerNorm(nn.Module):
53
+ def __init__(self, hidden_size, eps=1e-12):
54
+ """Construct a layernorm module in the TF style (epsilon inside the square root)."""
55
+ super(LayerNorm, self).__init__()
56
+ self.weight = nn.Parameter(torch.ones(hidden_size))
57
+ self.bias = nn.Parameter(torch.zeros(hidden_size))
58
+ self.variance_epsilon = eps
59
+
60
+ def forward(self, x):
61
+ u = x.mean(-1, keepdim=True)
62
+ s = (x - u).pow(2).mean(-1, keepdim=True)
63
+ x = (x - u) / torch.sqrt(s + self.variance_epsilon)
64
+ return self.weight * x + self.bias
65
+
66
+
67
+ class Conv1D(nn.Module):
68
+ def __init__(self, nf, nx):
69
+ super(Conv1D, self).__init__()
70
+ self.nf = nf
71
+ w = torch.empty(nx, nf)
72
+ nn.init.normal_(w, std=0.02)
73
+ self.weight = Parameter(w)
74
+ self.bias = Parameter(torch.zeros(nf))
75
+
76
+ def forward(self, x):
77
+ size_out = x.size()[:-1] + (self.nf,)
78
+ x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
79
+ x = x.view(*size_out)
80
+ return x
81
+
82
+
83
+ class Attention(nn.Module):
84
+ def __init__(self, nx, n_ctx, config, scale=False):
85
+ super(Attention, self).__init__()
86
+ n_state = nx # in Attention: n_state=768 (nx=n_embd)
87
+ # [switch nx => n_state from Block to Attention to keep identical to TF implem]
88
+
89
+ assert n_state % config.n_head == 0
90
+ self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
91
+ self.n_head = config.n_head
92
+ self.split_size = n_state
93
+ self.scale = scale
94
+ self.c_attn = Conv1D(n_state * 3, nx)  # immediately overridden by the LoRA MergedLinear below
95
+ self.c_attn = lora.MergedLinear(
96
+ nx, n_state * 3,
97
+ r=config.lora_attn_dim,
98
+ lora_alpha=config.lora_attn_alpha,
99
+ lora_dropout=config.lora_dropout,
100
+ enable_lora=[True, False, True],
101
+ fan_in_fan_out=True,
102
+ merge_weights=False
103
+ )
104
+ # self.c_attn = lora.Linear(
105
+ # nx, n_state * 3,
106
+ # r=config.lora_attn_dim,
107
+ # lora_alpha=config.lora_attn_alpha,
108
+ # lora_dropout=config.lora_dropout,
109
+ # fan_in_fan_out=True,
110
+ # merge_weights=False
111
+ # )
112
+ print(f"scaling = {config.lora_attn_alpha / config.lora_attn_dim}")
113
+ self.c_proj = Conv1D(n_state, nx)
114
+
115
+ self.config = config
116
+
117
+ def _attn(self, q, k, v, len_kv=None):
118
+ w = torch.matmul(q, k)
119
+ if self.scale:
120
+ w = w / math.sqrt(v.size(-1))
121
+ nd, ns = w.size(-2), w.size(-1)
122
+ b = self.bias[:, :, ns-nd:ns, :ns]
123
+ w = w * b - 1e10 * (1 - b)
124
+
125
+ # q : (batch, head, q_seq_length, head_features)
126
+ # k : (batch, head, head_features, kv_seq_length)
127
+ # w : (batch, head, q_seq_length, kv_seq_length)
128
+ # v : (batch, head, kv_seq_length, head_features)
129
+ if len_kv is not None:
130
+ _len = torch.arange(k.size(-1), device=k.device)
131
+ _input_msk = _len[None, :] >= (len_kv)[:, None]
132
+ w = w.masked_fill(_input_msk.unsqueeze(1).unsqueeze(2), -1.0e10)
133
+
134
+ w = nn.Softmax(dim=-1)(w)
135
+ return torch.matmul(w, v)
136
+
137
+ def merge_heads(self, x):
138
+ x = x.permute(0, 2, 1, 3).contiguous()
139
+ new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
140
+ return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
141
+
142
+ def split_heads(self, x, k=False):
143
+ new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
144
+ x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
145
+ if k:
146
+ return x.permute(0, 2, 3, 1).contiguous() # (batch, head, head_features, seq_length)
147
+ else:
148
+ return x.permute(0, 2, 1, 3).contiguous() # (batch, head, seq_length, head_features)
149
+
150
+    def forward(self, x, history=None, layer_past=None, len_past=None):
+        hidden_states = x
+
+        x = self.c_attn(x)
+        query, key, value = x.split(self.split_size, dim=2)
+
+        query = self.split_heads(query)
+        key = self.split_heads(key, k=True)
+        value = self.split_heads(value)
+
+        #_input_msk = None
+
+        len_kv = None
+
+        if layer_past is not None:
+            # key : (batch, head, head_features, seq_length)
+            # value : (batch, head, seq_length, head_features)
+            # layer_past, key : (batch, head, seq_length, head_features)
+            if len_past is None:
+                past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
+                key = torch.cat((past_key, key), dim=-1)
+                value = torch.cat((past_value, value), dim=-2)
+            else:
+                key_seq = key.shape[-1]
+                assert key_seq == 1
+
+                _batch = torch.arange(0, key.shape[0], dtype=torch.long, device=key.device)
+
+                past_key, past_value = layer_past[0], layer_past[1]
+
+                past_key[_batch, :, len_past, :] = key.squeeze(-1)
+                past_value[_batch, :, len_past, :] = value.squeeze(-2)
+
+                key = past_key.transpose(-2, -1)
+                value = past_value
+
+                len_kv = len_past + 1
+
+        present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
+        a = self._attn(query, key, value, len_kv=len_kv)
+        a = self.merge_heads(a)
+        a = self.c_proj(a)
+        # logging.info(f"attention forward: {a[0,0,:100]}, present: {present[0,0,0,:]}")
+        return a, present
+
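To see how the layer_past branch above is meant to be used, here is a minimal incremental-decoding sketch with illustrative sizes: prefill once, then feed one token at a time while reusing the stacked key/value cache returned as `present`.

import torch

config = GPT2Config(lora_attn_dim=4)
attn = Attention(config.n_embd, config.n_ctx, config, scale=True)

x = torch.randn(1, 5, config.n_embd)          # 5-token prefix
_, present = attn(x)                          # present: (2, batch, head, 5, head_features)

x_next = torch.randn(1, 1, config.n_embd)     # one new token
out, present = attn(x_next, layer_past=present)
assert out.shape == (1, 1, config.n_embd)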
+
+class MLP(nn.Module):
+    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
+        super(MLP, self).__init__()
+        nx = config.n_embd
+        self.c_fc = Conv1D(n_state, nx)
+        self.c_proj = Conv1D(nx, n_state)
+        self.act = gelu
+
+    def forward(self, x):
+        h = self.act(self.c_fc(x))
+        h2 = self.c_proj(h)
+        return h2
+
+
+class Block(nn.Module):
+    def __init__(self, n_ctx, config, scale=False):
+        super(Block, self).__init__()
+        nx = config.n_embd
+        self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
+        self.attn = Attention(nx, n_ctx, config, scale)
+        self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
+        self.mlp = MLP(4 * nx, config)
+
+    def forward(self, x, layer_past=None, len_past=None):
+        a, present = self.attn(self.ln_1(x), layer_past=layer_past, len_past=len_past)
+        x = x + a
+        m = self.mlp(self.ln_2(x))
+        x = x + m
+        return x, present
+
+
+class GPT2Model(nn.Module):
+    def __init__(self, config):
+        super(GPT2Model, self).__init__()
+        self.n_layer = config.n_layer
+        self.n_embd = config.n_embd
+        self.n_vocab = config.vocab_size
+
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
+        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
+        block = Block(config.n_ctx, config, scale=True)
+        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
+        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids,
+        position_ids=None,
+        token_type_ids=None,
+        past=None,
+        len_past=None
+    ):
+        if past is None:
+            past_length = 0
+            past = [None] * len(self.h)
+        elif len_past is None:
+            # equal size for past. []
+            past_length = past[0][0].size(-2)
+
+        if position_ids is None and len_past is None:
+            position_ids = torch.arange(
+                past_length, input_ids.size(-1) + past_length,
+                dtype=torch.long, device=input_ids.device
+            )
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        elif len_past is not None:
+            position_ids = (len_past).unsqueeze(1)  # .long()
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        position_ids = position_ids.view(-1, position_ids.size(-1))
+
+        inputs_embeds = self.wte(input_ids)
+
+        position_embeds = self.wpe(position_ids)
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+            token_type_embeds = self.wte(token_type_ids)
+        else:
+            token_type_embeds = 0
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        presents = []
+        for block, layer_past in zip(self.h, past):
+            hidden_states, present = block(hidden_states, layer_past=layer_past, len_past=len_past)
+            presents.append(present)
+        hidden_states = self.ln_f(hidden_states)
+        output_shape = input_shape + (hidden_states.size(-1),)
+        return hidden_states.view(*output_shape), presents
+
+
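The same caching pattern at the model level: forward() returns one `present` per block, and that list can be fed back as `past` on the next step. An illustrative sketch with a deliberately tiny configuration:

import torch

config = GPT2Config(n_layer=2, lora_attn_dim=4)   # tiny, illustrative
model = GPT2Model(config)

ids = torch.randint(0, config.vocab_size, (1, 8))
hidden, presents = model(ids)                     # hidden: (1, 8, n_embd)
next_id = torch.randint(0, config.vocab_size, (1, 1))
hidden_step, presents = model(next_id, past=presents)
assert hidden_step.shape == (1, 1, config.n_embd)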
+class GPT2LMHead(nn.Module):
+    def __init__(self, model_embeddings_weights, config):
+        super(GPT2LMHead, self).__init__()
+        self.n_embd = config.n_embd
+        self.set_embeddings_weights(model_embeddings_weights)
+
+    def set_embeddings_weights(self, model_embeddings_weights):
+        embed_shape = model_embeddings_weights.shape
+        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
+        self.decoder.weight = model_embeddings_weights  # Tied weights
+
+    def forward(self, hidden_state):
+        # Truncated Language modeling logits (we remove the last token)
+        # h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
+        lm_logits = self.decoder(hidden_state)
+        return lm_logits
+
+
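The LM head reuses the token-embedding matrix rather than owning its own weight, so the logits are effectively hidden_state @ wte.weight.T. A quick illustrative check (GPT2Config is defined just below; sizes are hypothetical):

import torch
import torch.nn as nn

emb = nn.Embedding(50257, 768)
head = GPT2LMHead(emb.weight, GPT2Config())
assert head.decoder.weight.data_ptr() == emb.weight.data_ptr()   # shared storage
assert head(torch.randn(1, 4, 768)).shape == (1, 4, 50257)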
+class GPT2Config(object):
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=50257,
+        n_positions=1024,
+        n_ctx=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        lora_attn_dim=0,
+        lora_attn_alpha=128,
+        lora_dropout=0.0,
+        lora_r_dropout=0.0,
+        fix_dropout=0.0,
+    ):
+        self.vocab_size = vocab_size_or_config_json_file
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.lora_attn_dim = lora_attn_dim
+        self.lora_attn_alpha = lora_attn_alpha
+        self.lora_dropout = lora_dropout
+        self.lora_r_dropout = lora_r_dropout
+
+        self.fix_dropout = fix_dropout
+
+
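For orientation, a hedged example of a GPT-2-medium-sized configuration with LoRA enabled on the attention projection. The hyperparameters actually used come from the training scripts, so the values below are only illustrative:

config = GPT2Config(
    n_embd=1024, n_layer=24, n_head=16,       # GPT-2 medium geometry
    lora_attn_dim=4,                          # r passed to lora.MergedLinear
    lora_attn_alpha=32,                       # scaling = lora_attn_alpha / lora_attn_dim = 8
    lora_dropout=0.1,
)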
+class GPT2LMModel(nn.Module):
+    def __init__(self, config):
+        super(GPT2LMModel, self).__init__()
+        self.transformer = GPT2Model(config)
+        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.apply(self._init_weights)
+
+    def set_tied(self):
+        """ Make sure we are sharing the embeddings"""
+        self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
+
+    def forward(
+        self,
+        input_ids,
+        lm_labels=None,
+        lm_mask=None,
+        past=None,
+        len_past=None,
+        label_smooth=0.0,
+        is_report_accuracy=False
+    ):
+        _batch, _len = input_ids.shape
+        hidden_states, presents = self.transformer(input_ids, past=past, len_past=len_past)
+
+        # batch, seq, vocab
+        lm_logits = self.lm_head(hidden_states)
+
+        if lm_labels is not None:
+
+            if is_report_accuracy:
+                _pred_token = torch.argmax(lm_logits, dim=-1)
+                _hit = (_pred_token == lm_labels) * lm_mask
+
+                _t1_acc = torch.zeros(_batch, dtype=torch.float, device=input_ids.device)
+                _all_acc = torch.zeros(_batch, dtype=torch.float, device=input_ids.device)
+
+                for _b in range(0, _batch):
+                    for _i in range(0, _len):
+                        if lm_mask[_b, _i] >= 1.0:
+                            if _hit[_b, _i] > 0:
+                                _t1_acc[_b] = 1.0
+                                break
+
+                    _is_succ = True
+                    for _i in range(0, _len):
+                        if lm_mask[_b, _i] >= 1.0:
+                            if _hit[_b, _i] <= 0:
+                                _is_succ = False
+                                break
+
+                    if _is_succ:
+                        _all_acc[_b] = 1.0
+
+                #_t1_acc = _t1_acc * 1.0 / _batch
+                #_all_acc = _all_acc * 1.0 / _batch
+
+            if label_smooth > 0.0001:
+                logprobs = torch.nn.functional.log_softmax(lm_logits.view(-1, lm_logits.size(-1)), dim=-1)
+                nll_loss = -logprobs.gather(dim=-1, index=lm_labels.view(-1).unsqueeze(1))
+                nll_loss = nll_loss.squeeze(1)
+                smooth_loss = -logprobs.mean(dim=-1)
+                loss = (1.0 - label_smooth) * nll_loss + label_smooth * smooth_loss
+                loss = loss.view(_batch, _len)
+            else:
+                loss_fct = nn.CrossEntropyLoss(ignore_index=-1, reduction='none')
+                loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)).view(_batch, _len)
+
+            if lm_mask is None:
+                lm_mask = torch.ones(loss.shape, dtype=loss.dtype, device=loss.device)
+            loss = loss * lm_mask
+
+            loss = loss.sum() / (lm_mask.sum() + 0.0001)
+
+            if is_report_accuracy:
+                return lm_logits, loss, _t1_acc, _all_acc
+            else:
+                return lm_logits, loss
+        return lm_logits, presents
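A minimal sketch of calling forward() with labels and a loss mask (illustrative sizes and label smoothing; the real batches are built by the training script):

import torch

config = GPT2Config(n_layer=2, lora_attn_dim=4)
lm = GPT2LMModel(config)

ids = torch.randint(0, config.vocab_size, (2, 8))
mask = torch.ones(2, 8)
logits, loss = lm(ids, lm_labels=ids.clone(), lm_mask=mask, label_smooth=0.1)
assert logits.shape == (2, 8, config.vocab_size) and loss.dim() == 0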
+
+    def _init_weights(self, module):
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    def load_weight(self, state_dict):
+        if 'model_state_dict' in state_dict:
+            state_dict = state_dict['model_state_dict']
+
+        state_dict_tmp = copy.deepcopy(state_dict)
+        old_keys = []
+        new_keys = []
+        for key in state_dict_tmp:
+            new_key = None
+            if key.endswith(".g"):
+                new_key = key[:-2] + ".weight"
+            elif key.endswith(".b"):
+                new_key = key[:-2] + ".bias"
+            elif key.endswith(".w"):
+                new_key = key[:-2] + ".weight"
+
+            if key.startswith("module.transformer."):
+                new_key = key[len("module.transformer."):]
+
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        for n, p in self.transformer.named_parameters():
+            if n not in state_dict:
+                state_dict[n] = p
+
+        self.transformer.load_state_dict(state_dict, strict=False)
+        self.set_tied()
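Putting the pieces together, a hedged end-to-end sketch of how this class can be wired up for LoRA fine-tuning. The checkpoint path, hyperparameters, and optimizer choice here are illustrative, not taken from this commit:

import torch
import loralib as lora

config = GPT2Config(n_embd=1024, n_layer=24, n_head=16,
                    lora_attn_dim=4, lora_attn_alpha=32, lora_dropout=0.1)
model = GPT2LMModel(config)

# load_weight() renames .g/.b/.w checkpoint keys and re-ties the LM head.
state = torch.load('gpt2-medium-pytorch_model.bin', map_location='cpu')   # illustrative path
model.load_weight(state)

lora.mark_only_lora_as_trainable(model)        # train only the LoRA factors
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], lr=2e-4)

torch.save(lora.lora_state_dict(model), 'lora_only.pt')   # keep just the adapter weights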