andreslu committed on
Commit
0f14897
1 Parent(s): 5abcb03

Upload 25 files

evaluation.py ADDED
@@ -0,0 +1,254 @@
+ import argparse
+ import logging
+ import re
+ from datetime import datetime
+ import os
+
+ import numpy as np
+ import torch
+ from nltk import bleu, meteor
+ from rouge_score.rouge_scorer import RougeScorer
+ from tqdm import tqdm
+ from src.distinct_n.distinct_n.metrics import distinct_n_corpus_level as distinct_n
+
+ from inductor import BartInductor, CometInductor
+
+ FILES = {
+     'amie-yago2': 'data/RE-datasets/AMIE-yago2.txt',
+     'rules-yago2': 'data/RE-datasets/RuLES-yago2.txt',
+     'openrule155': 'data/OpenRule155.txt',
+     'fewrel': 'data/RE/fewrel-5.txt',
+     'semeval': 'data/RE/semeval-5.txt',
+     'TREx': 'data/RE/trex-5.txt',
+     'nyt10': 'data/RE/nyt10-5.txt',
+     'google-re': 'data/RE/google-re-5.txt',
+     'wiki80': 'data/RE/wiki80-5.txt',
+ }
+
+
+ if not os.path.exists('logs/'):
+     os.mkdir('logs/')
+
+ logging.basicConfig(
+     filename='logs/evaluation-{}.log'.format(str(datetime.now())),
+     format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+     datefmt='%m/%d/%Y %H:%M:%S',
+     level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def print_config(config):
+     config = vars(config)
+     logger.info("**************** MODEL CONFIGURATION ****************")
+     for key in sorted(config.keys()):
+         val = config[key]
+         keystr = "{}".format(key) + (" " * (25 - len(key)))
+         logger.info("{} --> {}".format(keystr, val))
+     logger.info("**************** MODEL CONFIGURATION ****************")
+
+
+ scorer = RougeScorer(['rougeL'], use_stemmer=True)
+
+
+ def rouge(references, hypothesis):
+     # Score the hypothesis against every reference and keep the best ROUGE-L F-measure.
+     scores = []
+     for reference in references:
+         scores.append(scorer.score(reference, hypothesis)['rougeL'][2])
+     return max(scores)
+
+
+ class RelationExtractionEvaluator(object):
+     def __init__(self, args):
+         self.args = args
+         if self.args.inductor == 'rule':
+             self.inductor = BartInductor(
+                 group_beam=self.args.group_beam,
+                 continue_pretrain_instance_generator=self.args.mlm_training,
+                 continue_pretrain_hypo_generator=self.args.bart_training,
+                 if_then=self.args.if_then,
+             )
+         elif self.args.inductor == 'comet':
+             self.inductor = CometInductor()
+
+     def clean(self, text):
+         # Drop anything generated after the second <mask> so the template ends with "<mask>."
+         segments = text.split('<mask>')
+         if len(segments) == 3 and segments[2].startswith('.'):
+             return '<mask>'.join(segments[:2]) + '<mask>.'
+         else:
+             return text
+
+     def clean_references(self, texts):
+         for i, text in enumerate(texts):
+             if text.endswith(" ."):
+                 texts[i] = text.replace(" .", ".")
+         return texts
+
+     def self_bleu(self, hypothesis):
+         # Pairwise BLEU-2 of each hypothesis against all the others; lower means more diverse output.
+         bleus = []
+         for i in range(len(hypothesis)):
+             bleus.append(bleu(
+                 hypothesis[:i] + hypothesis[i + 1:],
+                 hypothesis[i],
+                 weights=(0.5, 0.5)))
+         return np.mean(bleus)
+
+     def evaluate(self, task):
+         with torch.no_grad():
+             self.metrics = {
+                 "bleu-4": [],
+                 "bleu-3": [],
+                 "bleu-2": [],
+                 "bleu-1": [],
+                 "METEOR": [],
+                 "ROUGE-L": [],
+                 "self-BLEU-2": [],
+             }
+             with open(FILES[task], 'r', encoding='utf-8') as file:
+                 data = file.readlines()
+                 with tqdm(total=len(data)) as pbar:
+                     for row in data:
+                         pbar.update(1)
+                         row = row.strip().split('\t')
+                         inputs, head, tail, relations = row[0], row[1], row[2], row[3]
+                         inputs = inputs.strip()
+
+                         if relations.startswith('[') and relations.endswith(']'):
+                             inputs = re.sub("<A>|<B>", "<mask>", inputs)
+                             references = [relation.replace('<A>', '<mask>').replace('<B>', '<mask>').lower().strip() for relation in eval(relations)]
+                         else:
+                             references = [relations.replace('[X]', '<mask>').replace('[Y]', '<mask>').lower().strip()]
+                         references = self.clean_references(references)
+                         hypothesis = self.inductor.generate(inputs, k=10, topk=10)
+
+                         logger.info("***********Input************")
+                         logger.info(inputs)
+                         logger.info("*********Hypothesis*********")
+                         for i, hypo in enumerate(hypothesis):
+                             hypothesis[i] = self.clean(hypo.lower().strip())
+                             logger.info(hypo)
+                         logger.info("****************************")
+                         logger.info("*********References*********")
+                         logger.info(references)
+                         logger.info("****************************")
+
+                         if len(hypothesis) == 0:
+                             for k in self.metrics.keys():
+                                 if k != 'self-BLEU-2':
+                                     self.metrics[k].append(0.)
+                         else:
+                             for hypo in hypothesis:
+                                 try:
+                                     self.metrics['bleu-4'].append(
+                                         bleu(
+                                             [reference.split() for reference in references],
+                                             hypo.split(),
+                                             weights=(0.25, 0.25, 0.25, 0.25)
+                                         )
+                                     )
+                                 except Exception:
+                                     logger.warning("Skip bleu-4 in example: {}".format(inputs))
+
+                                 try:
+                                     self.metrics['bleu-3'].append(
+                                         bleu(
+                                             [reference.split() for reference in references],
+                                             hypo.split(),
+                                             weights=(1 / 3,) * 3
+                                         )
+                                     )
+                                 except Exception:
+                                     logger.warning("Skip bleu-3 in example: {}".format(inputs))
+
+                                 try:
+                                     self.metrics['bleu-2'].append(
+                                         bleu(
+                                             [reference.split() for reference in references],
+                                             hypo.split(),
+                                             weights=(0.5, 0.5)
+                                         )
+                                     )
+                                 except Exception:
+                                     logger.warning("Skip bleu-2 in example: {}".format(inputs))
+
+                                 try:
+                                     self.metrics['bleu-1'].append(
+                                         bleu(
+                                             [reference.split() for reference in references],
+                                             hypo.split(),
+                                             weights=(1.0,)
+                                         )
+                                     )
+                                 except Exception:
+                                     logger.warning("Skip bleu-1 in example: {}".format(inputs))
+
+                                 try:
+                                     self.metrics['METEOR'].append(meteor(references, hypo))
+                                 except Exception:
+                                     logger.warning("Skip METEOR in example: {}".format(inputs))
+
+                                 try:
+                                     self.metrics['ROUGE-L'].append(rouge(references, hypo))
+                                 except Exception:
+                                     logger.warning("Skip ROUGE-L in example: {}".format(inputs))
+
+                             # Self-BLEU is computed once over the whole hypothesis set per example.
+                             try:
+                                 self.metrics['self-BLEU-2'].append(self.self_bleu(hypothesis))
+                             except Exception:
+                                 logger.warning("Skip self-bleu-2 in example: {}.".format(inputs))
+
+         self.log_metrics(task, self.metrics)
+
+     def log_metrics(self, task, metrics):
+         logger.info("Task: {}".format(str(task)))
+         for k, v in metrics.items():
+             logger.info("{}: {}".format(k, str(np.mean(v))))
+
+         logger.info("*******************************************************")
+         logger.info("*******************************************************")
+         logger.info("*******************************************************")
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--inductor", type=str, default='rule')
+     # argparse's type=bool treats any non-empty string as True, so plain flags are used instead.
+     parser.add_argument("--group_beam", action="store_true")
+     parser.add_argument("--mlm_training", action="store_true")
+     parser.add_argument("--bart_training", action="store_true")
+     parser.add_argument("--if_then", action="store_true")
+     parser.add_argument("--task", type=str, default='openrule155')
+
+     args = parser.parse_args()
+
+     print_config(args)
+     evaluator = RelationExtractionEvaluator(args)
+     evaluator.evaluate(args.task)
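
For reference, a minimal sketch of how the evaluator above can be driven from Python rather than the CLI. The flag values are an illustrative assumption (they mirror the parser defaults, with the continued-pretrained Orion generators switched on), and a CUDA device plus the data files under data/ are assumed:

    import argparse
    from evaluation import RelationExtractionEvaluator

    # Illustrative namespace; field names match the argparse flags above.
    args = argparse.Namespace(inductor='rule', group_beam=True, mlm_training=True,
                              bart_training=True, if_then=False, task='openrule155')
    evaluator = RelationExtractionEvaluator(args)
    evaluator.evaluate(args.task)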
expbert.py ADDED
@@ -0,0 +1,282 @@
+ import argparse
+ import logging
+ import os
+ import random
+ from datetime import datetime
+
+ import numpy as np
+ import torch
+ from sklearn.metrics import accuracy_score, f1_score
+ from torch import nn
+ from torch.utils.data import DataLoader, Dataset
+ from tqdm import tqdm
+ from transformers import (AutoConfig, AutoModel,
+                           AutoModelForSequenceClassification, AutoTokenizer,
+                           BertForSequenceClassification, BertModel)
+
+ if not os.path.exists('logs/'):
+     os.mkdir('logs/')
+
+ logging.basicConfig(
+     filename='logs/expbert-{}.log'.format(str(datetime.now())),
+     format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+     datefmt='%m/%d/%Y %H:%M:%S',
+     level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ TASK2PATH = {
+     "disease-train": "data/disease/train.txt",
+     "disease-test": "data/disease/test.txt",
+     "spouse-train": "data/spouse/train.txt",
+     "spouse-test": "data/spouse/test.txt",
+ }
+
+ ANNOTATED_EXP = {
+     "spouse": "data/exp/expbert_spouse_explanation.txt",
+     "disease": "data/exp/expbert_disease_explanation.txt",
+ }
+
+ GENERATED_EXP = {
+     "spouse": "data/exp/orion_spouse_explanation.txt",
+     "disease": "data/exp/orion_disease_explanation.txt",
+ }
+
+
+ def set_random_seed(seed):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+
+ def print_config(config):
+     config = vars(config)
+     logger.info("**************** MODEL CONFIGURATION ****************")
+     for key in sorted(config.keys()):
+         val = config[key]
+         keystr = "{}".format(key) + (" " * (25 - len(key)))
+         logger.info("{} --> {}".format(keystr, val))
+     logger.info("**************** MODEL CONFIGURATION ****************")
+
+
+ class ExpBERT(nn.Module):
+     def __init__(self, args, exp_num):
+         super(ExpBERT, self).__init__()
+         self.args = args
+         self.exp_num = exp_num
+         self.config = AutoConfig.from_pretrained(args.model)
+         self.model = AutoModel.from_pretrained(args.model, config=self.config)
+         self.dropout = nn.Dropout(p=0.1)
+         # One [CLS] vector per explanation, concatenated into a single feature vector.
+         self.linear = nn.Linear(self.config.hidden_size * exp_num, 2)
+
+         self.criterion = nn.CrossEntropyLoss()
+
+     def forward(self, inputs):
+         for k, v in inputs["encoding"].items():
+             inputs["encoding"][k] = v.cuda()
+         pooler_output = self.model(**inputs["encoding"]).last_hidden_state[:, 0, :].reshape(1, self.exp_num * self.config.hidden_size)
+         pooler_output = self.dropout(pooler_output)
+         logits = self.linear(pooler_output)
+
+         loss = self.criterion(logits, torch.LongTensor([inputs["label"]]).cuda())
+         prediction = torch.argmax(logits)
+
+         return {
+             "loss": loss,
+             "prediction": prediction,
+         }
+
+
+ class REDataset(Dataset):
+     def __init__(self, path, exp, tokenizer):
+         super(REDataset, self).__init__()
+         self.tokenizer = tokenizer
+         self.exp = exp
+         self.sentences = []
+         self.labels = []
+         self.entities = []
+         with open(path, "r", encoding="utf-8") as file:
+             data = file.readlines()
+             for example in data:
+                 sentence, entity1, entity2, example_id, label = example.strip().split("\t")
+                 self.sentences.append(sentence)
+                 # Labels in the data files are "1" / "-1"; map them to 1 / 0.
+                 if int(label) == 1:
+                     self.labels.append(1)
+                 elif int(label) == -1:
+                     self.labels.append(0)
+
+                 self.entities.append([entity1, entity2])
+
+         logger.info("Number of Example in {}: {}".format(path, str(len(self.labels))))
+
+     def __len__(self):
+         return len(self.labels)
+
+     def __getitem__(self, index):
+         return {
+             "sentence": self.sentences[index],
+             "entity": self.entities[index],
+             "label": self.labels[index],
+         }
+
+     def collate_fn(self, batch):
+         # Pair each sentence with every explanation; entity placeholders in an
+         # explanation are filled with the example's entities before encoding.
+         outputs = []
+         for ex in batch:
+             temp = []
+             for exp in self.exp:
+                 if "{e1}" in exp or "{e2}" in exp:
+                     exp = exp.replace("{e1}", ex["entity"][0]).replace("{e2}", ex["entity"][1])
+                 else:
+                     for entity in ex["entity"]:
+                         index = exp.index('<mask>')
+                         exp = exp[:index] + entity + exp[index + len('<mask>'):]
+                 temp.append(exp)
+             outputs.append(
+                 {
+                     "encoding": self.tokenizer(
+                         [ex["sentence"]] * len(temp), temp,
+                         add_special_tokens=True,
+                         padding="longest",
+                         truncation=True,
+                         max_length=156,
+                         return_tensors="pt",
+                         return_attention_mask=True,
+                         return_token_type_ids=True,
+                     ),
+                     "label": ex["label"],
+                 }
+             )
+         return outputs
+
+     def collate_fn_(self, batch):
+         # Baseline collate (no explanations): encode the sentences alone.
+         texts = []
+         labels = []
+         for ex in batch:
+             texts.append(ex["sentence"])
+             labels.append(ex["label"])
+
+         outputs = self.tokenizer(
+             texts,
+             add_special_tokens=True,
+             padding="longest",
+             truncation=True,
+             max_length=156,
+             return_tensors="pt",
+             return_attention_mask=True,
+             return_token_type_ids=True,
+         )
+
+         outputs["labels"] = torch.LongTensor(labels)
+
+         return outputs
+
+
+ class Trainer(object):
+     def __init__(self, args):
+         self.args = args
+         print_config(args)
+         self.tokenizer = AutoTokenizer.from_pretrained(self.args.model)
+
+         TASK2EXP = GENERATED_EXP if args.generated_rules else ANNOTATED_EXP
+         with open(TASK2EXP[args.task], "r", encoding="utf-8") as file:
+             exp = file.readlines()
+
+         self.train_dataset = REDataset(TASK2PATH['{}-train'.format(args.task)], exp, self.tokenizer)
+         self.test_dataset = REDataset(TASK2PATH['{}-test'.format(args.task)], exp, self.tokenizer)
+         self.model = AutoModelForSequenceClassification.from_pretrained(args.model).cuda() if self.args.no_exp else ExpBERT(args, len(exp)).cuda()
+
+         self.train_loader = DataLoader(
+             self.train_dataset,
+             batch_size=args.batch_size,
+             shuffle=args.shuffle,
+             collate_fn=self.train_dataset.collate_fn_ if self.args.no_exp else self.train_dataset.collate_fn,
+         )
+
+         self.test_loader = DataLoader(
+             self.test_dataset,
+             batch_size=args.batch_size,
+             shuffle=args.shuffle,
+             collate_fn=self.test_dataset.collate_fn_ if self.args.no_exp else self.test_dataset.collate_fn,
+         )
+
+         self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.args.learning_rate)
+
+     def compute_metrics(self, labels, predictions):
+         accuracy = accuracy_score(y_pred=predictions, y_true=labels)
+         f1 = f1_score(y_pred=predictions, y_true=labels)
+
+         return accuracy, f1
+
+     def train(self):
+         self.test(-1)
+         for e in range(self.args.epochs):
+             # test() switches the model to eval mode, so re-enable dropout before each epoch.
+             self.model.train()
+             with tqdm(total=len(self.train_loader)) as pbar:
+                 for step, examples in enumerate(self.train_loader):
+                     self.model.zero_grad()
+                     if self.args.no_exp:
+                         for k, v in examples.items():
+                             examples[k] = v.cuda()
+                         outputs = self.model(**examples)
+                         outputs.loss.backward()
+                     else:
+                         # Each "example" holds one sentence paired with all explanations;
+                         # average the losses so the batch size does not scale the gradient.
+                         for ex in examples:
+                             outputs = self.model(ex)
+                             (outputs["loss"] / len(examples)).backward()
+
+                     torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+                     self.optimizer.step()
+                     pbar.update(1)
+
+             self.test(e)
+
+     def test(self, epoch):
+         self.model.eval()
+         with torch.no_grad():
+             with tqdm(total=len(self.test_loader)) as pbar:
+                 loss = []
+                 labels = []
+                 predictions = []
+                 for step, examples in enumerate(self.test_loader):
+                     if self.args.no_exp:
+                         for k, v in examples.items():
+                             examples[k] = v.cuda()
+                         outputs = self.model(**examples)
+                         loss.append(outputs.loss.item())
+                         labels.extend(examples["labels"].tolist())
+                         predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
+                     else:
+                         for ex in examples:
+                             labels.append(ex['label'])
+                             outputs = self.model(ex)
+                             loss.append(outputs["loss"].item())
+                             predictions.append(outputs['prediction'].tolist())
+
+                     pbar.update(1)
+                 # Argument order matches the (labels, predictions) signature above.
+                 accuracy, f1 = self.compute_metrics(labels, predictions)
+                 logger.info("[EPOCH {}] Accuracy: {} | F1-Score: {}. (Number of Data {})".format(epoch, accuracy, f1, len(predictions)))
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--task", type=str, default="spouse")
+     parser.add_argument("--model", type=str, default="bert-base-uncased")
+     parser.add_argument("--batch_size", type=int, default=32)
+     parser.add_argument("--learning_rate", type=float, default=2e-5)
+     # argparse's type=bool treats any non-empty string as True, so plain flags are used instead.
+     parser.add_argument("--shuffle", action="store_true")
+     parser.add_argument("--epochs", type=int, default=5)
+     parser.add_argument("--no_exp", action="store_true")
+     parser.add_argument("--generated_rules", action="store_true")
+
+     args = parser.parse_args()
+
+     for seed in range(42, 47):
+         set_random_seed(seed)
+         trainer = Trainer(args)
+         trainer.train()
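
A matching sketch for driving the trainer above programmatically. The namespace values are an illustrative assumption mirroring the parser defaults; --no_exp would switch to the plain sequence-classification baseline and --generated_rules would swap in the Orion-generated explanations. A CUDA device and the data/exp files are assumed:

    import argparse
    from expbert import Trainer, set_random_seed

    # Illustrative namespace; field names match the argparse flags above.
    args = argparse.Namespace(task='spouse', model='bert-base-uncased', batch_size=32,
                              learning_rate=2e-5, shuffle=False, epochs=5,
                              no_exp=False, generated_rules=False)
    set_random_seed(42)
    Trainer(args).train()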
inductor.py ADDED
@@ -0,0 +1,401 @@
+ import re
+ from copy import deepcopy
+
+ import torch
+ import torch.nn.functional as F
+ from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
+                           BartForConditionalGeneration, BartTokenizer)
+
+ from src.bart_with_group_beam import BartForConditionalGeneration_GroupBeam
+ from src.utils import (construct_template, filter_words,
+                        formalize_tA, post_process_template)
+
+ ORION_HYPO_GENERATOR = 'chenxran/orion-hypothesis-generator'
+ ORION_INS_GENERATOR = 'chenxran/orion-instance-generator'
+
+ RELATIONS = [
+     "Causes",
+     "HasProperty",
+     "MadeUpOf",
+     "isAfter",
+     "isBefore",
+     "xReact",
+     "xWant",
+     "xReason",
+     "xAttr",
+     "Desires",
+ ]
+
+
+ class BartInductor(object):
+     def __init__(
+         self,
+         group_beam=True,
+         continue_pretrain_instance_generator=True,
+         continue_pretrain_hypo_generator=True,
+         if_then=False,
+     ):
+         self.if_then = if_then
+         self.orion_instance_generator_path = 'facebook/bart-large' if not continue_pretrain_instance_generator else ORION_INS_GENERATOR
+         self.orion_hypothesis_generator_path = 'facebook/bart-large' if not continue_pretrain_hypo_generator else ORION_HYPO_GENERATOR
+
+         if group_beam:
+             self.orion_hypothesis_generator = BartForConditionalGeneration_GroupBeam.from_pretrained(self.orion_hypothesis_generator_path).cuda().eval().half()
+         else:
+             self.orion_hypothesis_generator = BartForConditionalGeneration.from_pretrained(self.orion_hypothesis_generator_path).cuda().eval().half()
+
+         self.orion_instance_generator = BartForConditionalGeneration.from_pretrained(self.orion_instance_generator_path).cuda().eval().half()
+
+         self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+         self.word_length = 2
+
+         self.stop_sub_list = ['he', 'she', 'this', 'that', 'and', 'it', 'which', 'who', 'whose', 'there', 'they', '.', 'its', 'one',
+                               'i', ',', 'the', 'nobody', 'his', 'her', 'also', 'only', 'currently', 'here', '()', 'what', 'where',
+                               'why', 'a', 'some', '"', ')', '(', 'now', 'everyone', 'everybody', 'their', 'often', 'usually', 'you',
+                               '-', '?', ';', 'in', 'on', 'each', 'both', 'him', 'typically', 'mostly', 'sometimes', 'normally',
+                               'always', 'usually', 'still', 'today', 'was', 'were', 'but', 'although', 'current', 'all', 'have',
+                               'has', 'later', 'with', 'most', 'nowadays', 'then', 'every', 'when', 'someone', 'anyone', 'somebody',
+                               'anybody', 'any', 'being', 'get', 'getting', 'thus', 'under', 'even', 'for', 'can', 'rarely', 'never',
+                               'may', 'generally', 'other', 'another', 'too', 'first', 'second', 'third', 'mainly', 'primarily',
+                               'having', 'have', 'has']
+
+         # Also block the capitalized variant of every alphabetic stop word.
+         self.stop_size = len(self.stop_sub_list)
+         for i in range(self.stop_size):
+             if self.stop_sub_list[i][0].isalpha():
+                 temp = self.stop_sub_list[i][0].upper() + self.stop_sub_list[i][1:]
+                 self.stop_sub_list.append(temp)
+
+         self.bad_words_ids = [self.tokenizer.encode(bad_word)[1:-1] for bad_word in ['also', ' also']]
+         stop_index = self.tokenizer(self.stop_sub_list, max_length=4, padding=True)
+         stop_index = torch.tensor(stop_index['input_ids'])[:, 1]
+         stop_weight = torch.zeros(1, self.tokenizer.vocab_size).cuda()
+         stop_weight[0, stop_index] -= 100
+         self.stop_weight = stop_weight[0, :]
+
+     def clean(self, text):
+         segments = text.split('<mask>')
+         if len(segments) == 3 and segments[2].startswith('.'):
+             return '<mask>'.join(segments[:2]) + '<mask>.'
+         else:
+             return text
+
+     def generate(self, inputs, k=10, topk=10):
+         with torch.no_grad():
+             tB_probs = self.generate_rule(inputs, k)
+             ret = [t[0].replace('<ent0>', '<mask>').replace('<ent1>', '<mask>') for t in tB_probs]
+
+             new_ret = []
+             for temp in ret:
+                 temp = self.clean(temp.strip())
+                 if len(new_ret) < topk and temp not in new_ret:
+                     new_ret.append(temp)
+
+             return new_ret
+
+     def explore_mask(self, tA, k, tokens, prob, required_token, probs):
+         # Recursively fill one <mask> at a time, keeping the top-k candidates at each step.
+         if required_token == 0:
+             return [[tokens, prob, probs]]
+         if required_token <= self.word_length:
+             k = min(k, 2)
+         ret = []
+         generated_ids = self.tokenizer(tA, max_length=128, padding='longest', return_tensors='pt')
+         for key in generated_ids.keys():
+             generated_ids[key] = generated_ids[key].cuda()
+         mask_index = torch.where(generated_ids["input_ids"][0] == self.tokenizer.mask_token_id)
+         generated_ret = self.orion_instance_generator(**generated_ids)
+         logits = generated_ret[0]
+         softmax = F.softmax(logits, dim=-1)
+         # Penalize stop words at the first mask position before taking the top-k.
+         mask_word = softmax[0, mask_index[0][0], :] + self.stop_weight
+         top_k = torch.topk(mask_word, k, dim=0)
+         for i in range(top_k[1].size(0)):
+             token_s = top_k[1][i]
+             prob_s = top_k[0][i].item()
+             token_this = self.tokenizer.decode([token_s]).strip()
+             if not token_this[0].isalpha() or len(token_this) <= 2:
+                 continue
+             index_s = tA.index(self.tokenizer.mask_token)
+             tAs = tA[:index_s] + token_this + tA[index_s + len(self.tokenizer.mask_token):]
+             tokens_this = [t for t in tokens]
+             tokens_this.append(token_this)
+             probs_new = deepcopy(probs)
+             probs_new.append(prob_s)
+             ret.extend(self.explore_mask(tAs, 1, tokens_this, prob_s * prob, required_token - 1, probs_new))
+         return ret
+
+     def extract_words_for_tA_bart(self, tA, k=6, print_it=False):
+         spans = [t.lower().strip() for t in tA[:-1].split('<mask>')]
+         generated_ids = self.tokenizer([tA], padding='longest', return_tensors='pt')['input_ids'].cuda()
+         generated_ret = self.orion_instance_generator.generate(
+             generated_ids,
+             num_beams=max(120, k),
+             max_length=generated_ids.size(1) + 15,
+             num_return_sequences=max(120, k),
+             output_scores=True,
+             return_dict_in_generate=True)
+         summary_ids = generated_ret['sequences']
+         probs = F.softmax(generated_ret['sequences_scores'], dim=0)
+         txts = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in summary_ids]
+         ret = []
+
+         for i, txt in enumerate(txts):
+             if tA.endswith('.'):
+                 if txt.endswith('.'):
+                     txt = txt[:-1].strip()
+                 txt += '.'
+             word_incomplete = False
+             prob = probs[i].item()
+             words_i = []
+
+             # Recover the instance words by aligning the generated text with the
+             # template spans on either side of each <mask>.
+             start_index = 0
+             for j in range(len(spans) - 1):
+                 span1 = spans[j]
+                 span2 = spans[j + 1]
+                 if (span1 in txt.lower()[start_index:]) and (span2 in txt.lower()[start_index:]):
+                     index1 = txt.lower().index(span1, start_index) + len(span1)
+                     if span2 == '':
+                         if txt[-1] == '.':
+                             index2 = len(txt) - 1
+                         else:
+                             index2 = len(txt)
+                     else:
+                         index2 = txt.lower().index(span2, start_index)
+
+                     words_i.append(txt[index1:index2].strip())
+                     start_index = index2
+                 else:
+                     word_incomplete = True
+             if word_incomplete:
+                 continue
+
+             ret.append([words_i, prob])
+         return sorted(ret, key=lambda x: x[1], reverse=True)[:k]
+
+     def extract_words_for_tA(self, tA, k=6):
+         word_mask_str = ' '.join([self.tokenizer.mask_token] * self.word_length)
+         tA = tA.replace('<mask>', word_mask_str)
+         mask_count = tA.count(self.tokenizer.mask_token)
+         mask_probs = self.explore_mask(tA, k * 20, [], 1.0, mask_count, [])
+         ret = []
+         visited_mask_txt = {}
+         for mask, prob, probs in mask_probs:
+             mask_txt = ' '.join(mask).lower()
+             if mask_txt in visited_mask_txt:
+                 continue
+             visited_mask_txt[mask_txt] = 1
+             words = []
+             probs_words = []
+             for i in range(0, mask_count, self.word_length):
+                 words.append(' '.join(mask[i: i + self.word_length]))
+                 prob_word = 1.0
+                 for j in range(i, i + self.word_length):
+                     prob_word *= probs[j]
+                 probs_words.append(prob_word)
+             ret.append([words, prob, probs_words])
+         return sorted(ret, key=lambda x: x[1], reverse=True)[:k]
+
+     def extract_templateBs_batch(self, words_prob, tA, k, print_it=False):
+         # Sort instances by tokenized length so each generation batch shares a length.
+         words_prob_sorted = []
+         for (words, probA, *_) in words_prob:
+             tokenized_word = self.tokenizer(words[0])
+             words_prob_sorted.append([words, probA, len(tokenized_word['input_ids'])])
+         words_prob_sorted.sort(key=lambda x: x[2])
+
+         batch_size = 8
+         templates = []
+         index_words = {}
+         ret = {}
+         num_beams = k
+         for enum, (words, probA, *_) in enumerate(words_prob_sorted):
+             template = construct_template(words, tA, self.if_then)
+             templates.extend(template)
+             for t in template:
+                 index_words[len(index_words)] = '\t'.join(words)
+             if (len(templates) == batch_size) or enum == len(words_prob_sorted) - 1 or (words_prob_sorted[enum + 1][2] != words_prob_sorted[enum][2]):
+                 generated_ids = self.tokenizer(templates, padding="longest", return_tensors='pt')['input_ids'].cuda()
+                 generated_ret = self.orion_hypothesis_generator.generate(
+                     generated_ids,
+                     num_beams=num_beams,
+                     num_beam_groups=num_beams,
+                     max_length=28,
+                     num_return_sequences=num_beams,
+                     min_length=3,
+                     diversity_penalty=1.0,
+                     early_stopping=True,
+                     bad_words_ids=self.bad_words_ids,
+                     output_scores=True,
+                     return_dict_in_generate=True,
+                     # custom kwarg consumed by the group-beam subclass
+                     decoder_ori_input_ids=generated_ids,
+                     top_p=0.95,
+                 )
+                 summary_ids = generated_ret['sequences'].reshape((len(templates), num_beams, -1))
+                 probs = F.softmax(generated_ret['sequences_scores'].reshape((len(templates), num_beams)), dim=1)
+                 for ii in range(summary_ids.size(0)):
+                     txts = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in summary_ids[ii]]
+                     ii_template = []
+                     words_ii = index_words[ii].split('\t')
+                     for i, txt in enumerate(txts):
+                         prob = probs[ii][i].item() * probA
+
+                         txt = txt.lower()
+                         txt = post_process_template(txt)
+
+                         # Zero out hypotheses that drop the instance words; otherwise
+                         # replace the words with entity placeholders.
+                         words_ii_matched = [word.lower() for word in words_ii]
+                         if words_ii_matched is None:
+                             prob = 0.0
+                         else:
+                             for j, word in enumerate(words_ii_matched):
+                                 if word not in txt:
+                                     prob = 0.0
+                                 else:
+                                     txt = txt.replace(word, '<ent{}>'.format(j), 1)
+
+                         if txt.count(' ') + 1 <= 3:
+                             continue
+
+                         ii_template.append([txt, prob])
+                     # Aggregate the probability mass of identical templates.
+                     for template, prob in ii_template:
+                         if template not in ret:
+                             ret[template] = 0.0
+                         ret[template] += prob
+                 templates.clear()
+                 index_words.clear()
+
+         return ret
+
+     def generate_rule(self, tA, k=10, print_it=False):
+         tA = formalize_tA(tA)
+         if 'bart' in str(self.orion_instance_generator.__class__).lower():
+             words_prob = self.extract_words_for_tA_bart(tA, k, print_it=print_it)
+             words_prob = filter_words(words_prob)[:k]
+         else:
+             words_prob = self.extract_words_for_tA(tA, k)
+             words_prob = filter_words(words_prob)[:k]
+
+         tB_prob = self.extract_templateBs_batch(words_prob, tA, k, print_it=print_it)
+
+         ret = []
+         for k1 in tB_prob:
+             ret.append([k1, tB_prob[k1]])
+         ret = sorted(ret, key=lambda x: x[1], reverse=True)[:k]
+         if self.if_then:
+             for i, temp in enumerate(ret):
+                 sentence = temp[0]
+                 if "then" in sentence:
+                     sentence = sentence.split("then")[-1]
+                 else:
+                     sentence = sentence.replace("if", "")
+                 ret[i][0] = sentence
+         return ret
+
+
+ class CometInductor(object):
+     def __init__(self):
+         self.model = AutoModelForSeq2SeqLM.from_pretrained("adamlin/comet-atomic_2020_BART").cuda().eval()
+         self.tokenizer = AutoTokenizer.from_pretrained("adamlin/comet-atomic_2020_BART")
+         self.task = "summarization"
+         self.use_task_specific_params()
+         self.decoder_start_token_id = None
+
+     def drop_repeat(self, old_list):
+         new_list = []
+         for item in old_list:
+             if item not in new_list:
+                 new_list.append(item)
+
+         return new_list
+
+     def chunks(self, lst, n):
+         """Yield successive n-sized chunks from lst."""
+         for i in range(0, len(lst), n):
+             yield lst[i: i + n]
+
+     def use_task_specific_params(self):
+         """Update config with summarization specific params."""
+         task_specific_params = self.model.config.task_specific_params
+
+         if task_specific_params is not None:
+             pars = task_specific_params.get(self.task, {})
+             self.model.config.update(pars)
+
+     def trim_batch(self, input_ids, pad_token_id, attention_mask=None):
+         """Remove columns that are populated exclusively by pad_token_id."""
+         keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
+         if attention_mask is None:
+             return input_ids[:, keep_column_mask]
+         else:
+             return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask])
+
+     def generate(self, inputs, k, topk):
+         outputs = []
+         words = ['PersonX', 'PersonY']
+         for i, _ in enumerate(re.findall("<mask>", inputs)):
+             index = inputs.index('<mask>')
+             inputs = inputs[:index] + words[i] + inputs[index + len('<mask>'):]
+
+         for relation in RELATIONS:
+             # Build each query from the original premise; mutating `inputs` in the
+             # loop would stack earlier relations onto later queries.
+             query = "{} {} [GEN]".format(inputs[:-1], relation)
+             gen = self.generate_(query, num_generate=10)
+             switch = 0
+             for output in gen[0]:
+                 output = output.strip()
+                 if re.search("PersonX|X", output) and re.search("PersonY|Y", output):
+                     temp = re.sub("PersonX|X|PersonY|Y", "<mask>", output.strip())
+                     if temp.endswith("."):
+                         outputs.append(temp)
+                     else:
+                         outputs.append(temp + ".")
+                     switch = 1
+                     break
+
+             if switch == 0:
+                 output = gen[0][0]
+                 temp = re.sub("PersonX|X|PersonY|Y", "<mask>", output.strip())
+                 if temp.endswith("."):
+                     outputs.append(temp)
+                 else:
+                     outputs.append(temp + ".")
+
+         outputs = [output.replace('PersonX', '<mask>').replace('PersonY', '<mask>') for output in outputs]
+         return outputs
+
+     def generate_(self, queries, decode_method="beam", num_generate=5):
+         with torch.no_grad():
+             decs = []
+             batch = self.tokenizer(queries, return_tensors="pt", padding="longest")
+             input_ids, attention_mask = self.trim_batch(**batch, pad_token_id=self.tokenizer.pad_token_id)
+
+             summaries = self.model.generate(
+                 input_ids=input_ids.cuda(),
+                 attention_mask=attention_mask.cuda(),
+                 decoder_start_token_id=self.decoder_start_token_id,
+                 num_beams=num_generate,
+                 num_return_sequences=num_generate,
+             )
+
+             dec = self.tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+             decs.append(dec)
+
+             return decs
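
A minimal sketch of how BartInductor is meant to be driven. The premise string below is an illustrative assumption (any sentence with two <mask> entity slots works), and a CUDA device is required since the generators are moved to GPU in __init__:

    from inductor import BartInductor

    inductor = BartInductor()
    # Premise with two entity slots; returns up to `topk` de-duplicated rule templates.
    rules = inductor.generate('<mask> is the capital of <mask>.', k=10, topk=10)
    for rule in rules:
        print(rule)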
src/__pycache__/bart_with_group_beam.cpython-38.pyc ADDED
Binary file (17.7 kB).
 
src/bart_with_group_beam.py ADDED
@@ -0,0 +1,608 @@
1
+ from transformers.models.bart import BartForConditionalGeneration
2
+ import torch
3
+ from transformers.generation_beam_search import BeamScorer
4
+ from abc import ABC, abstractmethod
5
+ from collections import UserDict
6
+ from typing import Optional, Tuple, Union, Dict, Any
7
+ from transformers.generation_logits_process import LogitsProcessorList
8
+ from transformers.generation_utils import BeamSearchEncoderDecoderOutput,BeamSearchDecoderOnlyOutput
9
+ from torch.nn import functional as F
10
+ from transformers.file_utils import ModelOutput
11
+ import torch.nn
12
+
13
+ BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput]
14
+
15
+
16
+ class BartForConditionalGeneration_GroupBeam(BartForConditionalGeneration):
17
+
18
+
19
+ def beam_search(
20
+ self,
21
+ input_ids: torch.LongTensor,
22
+ beam_scorer: BeamScorer,
23
+ logits_processor: Optional[LogitsProcessorList] = None,
24
+ max_length: Optional[int] = None,
25
+ pad_token_id: Optional[int] = None,
26
+ eos_token_id: Optional[int] = None,
27
+ output_attentions: Optional[bool] = None,
28
+ output_hidden_states: Optional[bool] = None,
29
+ output_scores: Optional[bool] = None,
30
+ return_dict_in_generate: Optional[bool] = None,
31
+ **model_kwargs,
32
+ ) -> Union[BeamSearchOutput, torch.LongTensor]:
33
+ r"""
34
+ Generates sequences for models with a language modeling head using beam search decoding.
35
+
36
+ Parameters:
37
+
38
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
39
+ The sequence used as a prompt for the generation. If :obj:`None` the method initializes it as an empty
40
+ :obj:`torch.LongTensor` of shape :obj:`(1,)`.
41
+ beam_scorer (:obj:`BeamScorer`):
42
+ An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are
43
+ constructed, stored and sorted during generation. For more information, the documentation of
44
+ :class:`~transformers.BeamScorer` should be read.
45
+ logits_processor (:obj:`LogitsProcessorList`, `optional`):
46
+ An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
47
+ :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
48
+ head applied at each generation step.
49
+ max_length (:obj:`int`, `optional`, defaults to 20):
50
+ The maximum length of the sequence to be generated.
51
+ pad_token_id (:obj:`int`, `optional`):
52
+ The id of the `padding` token.
53
+ eos_token_id (:obj:`int`, `optional`):
54
+ The id of the `end-of-sequence` token.
55
+ output_attentions (:obj:`bool`, `optional`, defaults to `False`):
56
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
57
+ returned tensors for more details.
58
+ output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
59
+ Whether or not to return trhe hidden states of all layers. See ``hidden_states`` under returned tensors
60
+ for more details.
61
+ output_scores (:obj:`bool`, `optional`, defaults to `False`):
62
+ Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
63
+ return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
64
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
65
+ model_kwargs:
66
+ Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
67
+ model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
68
+
69
+ Return:
70
+ :class:`~transformers.generation_utilsBeamSearchDecoderOnlyOutput`,
71
+ :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A
72
+ :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
73
+ :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
74
+ ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
75
+ :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if
76
+ ``model.config.is_encoder_decoder=True``.
77
+
78
+
79
+ Examples::
80
+
81
+ >>> from transformers import (
82
+ ... AutoTokenizer,
83
+ ... AutoModelForSeq2SeqLM,
84
+ ... LogitsProcessorList,
85
+ ... MinLengthLogitsProcessor,
86
+ ... BeamSearchScorer,
87
+ ... )
88
+ >>> import torch
89
+
90
+ >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
91
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
92
+
93
+ >>> encoder_input_str = "translate English to German: How old are you?"
94
+ >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
95
+
96
+
97
+ >>> # lets run beam search using 3 beams
98
+ >>> num_beams = 3
99
+ >>> # define decoder start token ids
100
+ >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
101
+ >>> input_ids = input_ids * model.config.decoder_start_token_id
102
+
103
+ >>> # add encoder_outputs to model keyword arguments
104
+ >>> model_kwargs = {
105
+ ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
106
+ ... }
107
+
108
+ >>> # instantiate beam scorer
109
+ >>> beam_scorer = BeamSearchScorer(
110
+ ... batch_size=1,
111
+ ... max_length=model.config.max_length,
112
+ ... num_beams=num_beams,
113
+ ... device=model.device,
114
+ ... )
115
+
116
+ >>> # instantiate logits processors
117
+ >>> logits_processor = LogitsProcessorList([
118
+ ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
119
+ ... ])
120
+
121
+ >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
122
+
123
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
124
+ """
125
+
126
+ # init values
127
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
128
+ max_length = max_length if max_length is not None else self.config.max_length
129
+ pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
130
+ eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
131
+ output_scores = output_scores if output_scores is not None else self.config.output_scores
132
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
133
+ output_hidden_states = (
134
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
135
+ )
136
+ return_dict_in_generate = (
137
+ return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
138
+ )
139
+
140
+ # init attention / hidden states / scores tuples
141
+ scores = () if (return_dict_in_generate and output_scores) else None
142
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
143
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
144
+
145
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
146
+ if return_dict_in_generate and self.config.is_encoder_decoder:
147
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
148
+ encoder_hidden_states = (
149
+ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
150
+ )
151
+
152
+ batch_size = len(beam_scorer._beam_hyps)
153
+ num_beams = beam_scorer.num_beams
154
+
155
+ batch_beam_size, cur_len = input_ids.shape
156
+
157
+ assert (
158
+ num_beams * batch_size == batch_beam_size
159
+ ), "Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
160
+
161
+ beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
162
+ beam_scores[:, 1:] = -1e9
163
+ beam_scores = beam_scores.view((batch_size * num_beams,))
164
+
165
+ while cur_len < max_length:
166
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
167
+
168
+ outputs = self(
169
+ **model_inputs,
170
+ return_dict=True,
171
+ output_attentions=output_attentions,
172
+ output_hidden_states=output_hidden_states,
173
+ )
174
+ next_token_logits = outputs.logits[:, -1, :]
175
+
176
+ # adjust tokens for Bart, *e.g.*
177
+ next_token_logits = self.adjust_logits_during_generation(
178
+ next_token_logits, cur_len=cur_len, max_length=max_length
179
+ )
180
+
181
+ next_token_scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size)
182
+
183
+ next_token_scores = logits_processor(input_ids, next_token_scores)
184
+ next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores)
185
+
186
+ # Store scores, attentions and hidden_states when required
187
+ if return_dict_in_generate:
188
+ if output_scores:
189
+ scores += (next_token_scores,)
190
+ if output_attentions:
191
+ decoder_attentions += (
192
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
193
+ )
194
+
195
+ if output_hidden_states:
196
+ decoder_hidden_states += (
197
+ (outputs.decoder_hidden_states,)
198
+ if self.config.is_encoder_decoder
199
+ else (outputs.hidden_states,)
200
+ )
201
+
202
+ # reshape for beam search
203
+ vocab_size = next_token_scores.shape[-1]
204
+ next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
205
+ #m = torch.nn.LayerNorm(num_beams * vocab_size)
206
+ #next_token_scores = m(next_token_scores)
207
+
208
+ next_token_scores_group = torch.sum(next_token_scores,dim=0,keepdim=True).expand(batch_size,-1) / batch_size
209
+
210
+ for i in range(next_token_scores.size(0)):
211
+ '''tmin = torch.min(next_token_scores_group[i])
212
+ for j in range(1,len(model_kwargs['decoder_ori_input_ids'][i])):
213
+ next_token_scores_group[i][model_kwargs['decoder_ori_input_ids'][i][j]] = tmin'''
214
+ for t in model_kwargs['decoder_ori_input_ids'][i]:
215
+ for j in range(num_beams):
216
+ #if t not in input_ids[i] or t==1:
217
+ next_token_scores_group[i][j * vocab_size + t] = next_token_scores[i][j * vocab_size + t]
218
+
219
+ next_token_scores, next_tokens = torch.topk(
220
+ next_token_scores_group, 2 * num_beams, dim=1, largest=True, sorted=True)
221
+
222
+ '''next_token_scores_group = next_token_scores_group.expand(batch_size,-1)
223
+ next_tokens_group = next_tokens_group.expand(batch_size,-1)
224
+
225
+ next_token_scores, next_tokens = torch.topk(
226
+ next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True
227
+ )
228
+
229
+ for i in range(next_token_scores.size(0)):
230
+ j1 = 0
231
+ for j in range(next_token_scores.size(1)):
232
+ if next_tokens[i][j] not in model_kwargs['decoder_ori_input_ids'][i]:
233
+ next_tokens[i][j] = next_tokens_group[i][j1]
234
+ j1 += 1
235
+ next_token_scores = next_token_scores_group
236
+
237
+ del next_token_scores_group, next_tokens_group'''
238
+
239
+ next_indices = next_tokens // vocab_size
240
+ next_tokens = next_tokens % vocab_size
241
+
242
+ # stateless
243
+ beam_outputs = beam_scorer.process(
244
+ input_ids,
245
+ next_token_scores,
246
+ next_tokens,
247
+ next_indices,
248
+ pad_token_id=pad_token_id,
249
+ eos_token_id=eos_token_id,
250
+ )
251
+ beam_scores = beam_outputs["next_beam_scores"]
252
+ beam_next_tokens = beam_outputs["next_beam_tokens"]
253
+ beam_idx = beam_outputs["next_beam_indices"]
254
+
255
+ input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
256
+
257
+ cur_len = cur_len + 1
258
+
259
+ model_kwargs = self._update_model_kwargs_for_generation(
260
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
261
+ )
262
+ if model_kwargs["past"] is not None:
263
+ model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx)
264
+
265
+ if beam_scorer.is_done:
266
+ break
267
+
268
+ sequence_outputs = beam_scorer.finalize(
269
+ input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id
270
+ )
271
+
272
+ if return_dict_in_generate:
273
+ if not output_scores:
274
+ sequence_outputs["sequence_scores"] = None
275
+ if self.config.is_encoder_decoder:
276
+ return BeamSearchEncoderDecoderOutput(
277
+ sequences=sequence_outputs["sequences"],
278
+ sequences_scores=sequence_outputs["sequence_scores"],
279
+ scores=scores,
280
+ encoder_attentions=encoder_attentions,
281
+ encoder_hidden_states=encoder_hidden_states,
282
+ decoder_attentions=decoder_attentions,
283
+ decoder_hidden_states=decoder_hidden_states,
284
+ )
285
+ else:
286
+ return BeamSearchDecoderOnlyOutput(
287
+ sequences=sequence_outputs["sequences"],
288
+ sequences_scores=sequence_outputs["sequence_scores"],
289
+ scores=scores,
290
+ attentions=decoder_attentions,
291
+ hidden_states=decoder_hidden_states,
292
+ )
293
+ else:
294
+ return sequence_outputs["sequences"]
295
+
296
+ def group_beam_search(
297
+ self,
298
+ input_ids: torch.LongTensor,
299
+ beam_scorer: BeamScorer,
300
+ logits_processor: Optional[LogitsProcessorList] = None,
301
+ max_length: Optional[int] = None,
302
+ pad_token_id: Optional[int] = None,
303
+ eos_token_id: Optional[int] = None,
304
+ output_attentions: Optional[bool] = None,
305
+ output_hidden_states: Optional[bool] = None,
306
+ output_scores: Optional[bool] = None,
307
+ return_dict_in_generate: Optional[bool] = None,
308
+ **model_kwargs,
309
+ ):
310
+ r"""
311
+ Generates sequences for models with a language modeling head using beam search decoding.
312
+
313
+ Parameters:
314
+
315
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
316
+ The sequence used as a prompt for the generation. If :obj:`None` the method initializes it as an empty
317
+ :obj:`torch.LongTensor` of shape :obj:`(1,)`.
318
+ beam_scorer (:obj:`BeamScorer`):
319
+ An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are
320
+ constructed, stored and sorted during generation. For more information, the documentation of
321
+ :class:`~transformers.BeamScorer` should be read.
322
+ logits_processor (:obj:`LogitsProcessorList`, `optional`):
323
+ An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
324
+ :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
325
+ head applied at each generation step.
326
+ max_length (:obj:`int`, `optional`, defaults to 20):
327
+ The maximum length of the sequence to be generated.
328
+ pad_token_id (:obj:`int`, `optional`):
329
+ The id of the `padding` token.
330
+ eos_token_id (:obj:`int`, `optional`):
331
+ The id of the `end-of-sequence` token.
332
+ output_attentions (:obj:`bool`, `optional`, defaults to `False`):
333
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
334
+ returned tensors for more details.
335
+ output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
336
+ Whether or not to return trhe hidden states of all layers. See ``hidden_states`` under returned tensors
337
+ for more details.
338
+ output_scores (:obj:`bool`, `optional`, defaults to `False`):
339
+ Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
340
+ return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
341
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
342
+ model_kwargs:
343
+ Additional model specific kwargs that will be forwarded to the :obj:`forward` function of the model. If
344
+ model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
345
+
346
+ Return:
347
+ :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`,
348
+ :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A
349
+ :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
350
+ :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
351
+ :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
352
+ ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
353
+ :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if
354
+ ``model.config.is_encoder_decoder=True``.
355
+
356
+ Examples::
357
+
358
+ >>> from transformers import (
359
+ ... AutoTokenizer,
360
+ ... AutoModelForSeq2SeqLM,
361
+ ... LogitsProcessorList,
362
+ ... MinLengthLogitsProcessor,
363
+ ... HammingDiversityLogitsProcessor,
364
+ ... BeamSearchScorer,
365
+ ... )
366
+ >>> import torch
367
+
368
+ >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
369
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
370
+
371
+ >>> encoder_input_str = "translate English to German: How old are you?"
372
+ >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
373
+
374
+
375
+ >>> # lets run diverse beam search using 6 beams
376
+ >>> num_beams = 6
377
+ >>> # define decoder start token ids
378
+ >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
379
+ >>> input_ids = input_ids * model.config.decoder_start_token_id
380
+
381
+ >>> # add encoder_outputs to model keyword arguments
382
+ >>> model_kwargs = {
383
+ ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
384
+ ... }
385
+
386
+ >>> # instantiate beam scorer
387
+ >>> beam_scorer = BeamSearchScorer(
388
+ ... batch_size=1,
389
+ ... max_length=model.config.max_length,
390
+ ... num_beams=num_beams,
391
+ ... device=model.device,
392
+ ... num_beam_groups=3
393
+ ... )
394
+
395
+ >>> # instantiate logits processors
396
+ >>> logits_processor = LogitsProcessorList([
397
+ ... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
398
+ ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
399
+ ... ])
400
+
401
+ >>> outputs = model.group_beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
402
+
403
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
404
+ """
405
+
406
+ # init values
407
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
408
+ max_length = max_length if max_length is not None else self.config.max_length
409
+ pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
410
+ eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
411
+ output_scores = output_scores if output_scores is not None else self.config.output_scores
412
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
413
+ output_hidden_states = (
414
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
415
+ )
416
+ return_dict_in_generate = (
417
+ return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
418
+ )
419
+
420
+ # init attention / hidden states / scores tuples
421
+ scores = () if (return_dict_in_generate and output_scores) else None
422
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
423
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
424
+
425
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
426
+ if return_dict_in_generate and self.config.is_encoder_decoder:
427
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
428
+ encoder_hidden_states = (
429
+ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
430
+ )
431
+
432
+ batch_size = len(beam_scorer._beam_hyps)
433
+ num_beams = beam_scorer.num_beams
434
+ num_beam_groups = beam_scorer.num_beam_groups
435
+ num_sub_beams = num_beams // num_beam_groups
436
+ device = input_ids.device
437
+
438
+ batch_beam_size, cur_len = input_ids.shape
439
+
440
+ assert (
441
+ num_beams * batch_size == batch_beam_size
442
+ ), f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
443
+
444
+ beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
445
+ # initialise score of first beam of each group with 0 and the rest with 1e-9. This ensures that the beams in
446
+ # the same group don't produce same tokens everytime.
447
+ beam_scores[:, ::num_sub_beams] = 0
448
+ beam_scores = beam_scores.view((batch_size * num_beams,))
449
+
+ while cur_len < max_length:
+     # predicted tokens in cur_len step
+     current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)
+
+     # indices which will form the beams in the next time step
+     reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)
+
+     # do one decoder step on all beams of all sentences in batch
+     model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+     outputs = self(
+         **model_inputs,
+         return_dict=True,
+         output_attentions=output_attentions,
+         output_hidden_states=output_hidden_states,
+     )
+
+     for beam_group_idx in range(num_beam_groups):
+         group_start_idx = beam_group_idx * num_sub_beams
+         group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
+         group_size = group_end_idx - group_start_idx
+
+         # indices of beams of current group among all sentences in batch
+         batch_group_indices = []
+
+         if output_scores:
+             processed_score = torch.zeros_like(outputs.logits[:, -1, :]).half()  # .float()
+
+         for batch_idx in range(batch_size):
+             batch_group_indices.extend(
+                 [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
+             )
+         group_input_ids = input_ids[batch_group_indices]
+
+         # select outputs of beams of current group only
+         next_token_logits = outputs.logits[batch_group_indices, -1, :]
+
+         # adjust tokens for Bart, *e.g.*
+         next_token_logits = self.adjust_logits_during_generation(
+             next_token_logits, cur_len=cur_len, max_length=max_length
+         )
+
+         next_token_scores = F.log_softmax(next_token_logits, dim=-1)  # (batch_size * group_size, vocab_size)
+         vocab_size = next_token_scores.shape[-1]
+
+         next_token_scores = logits_processor(
+             group_input_ids, next_token_scores, current_tokens=current_tokens, beam_group_idx=beam_group_idx
+         )
+         next_token_scores = next_token_scores + beam_scores[batch_group_indices].unsqueeze(-1).expand_as(
+             next_token_scores
+         )
+
+         if output_scores:
+             processed_score[batch_group_indices] = next_token_scores.half()  # .float()
+
+         # reshape for beam search
+         next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)
+         # modification: average the next-token scores over the batch so that every example
+         # votes on a shared continuation, then restore each example's own scores for tokens
+         # drawn from that example's original input (decoder_ori_input_ids).
+         next_token_scores_group = torch.sum(next_token_scores, dim=0, keepdim=True).expand(
+             batch_size, -1
+         ) / batch_size
+
+         for i in range(next_token_scores.size(0)):
+             # tmin = torch.min(next_token_scores_group[i])
+             # for j in range(1, len(model_kwargs['decoder_ori_input_ids'][i])):
+             #     next_token_scores_group[i][model_kwargs['decoder_ori_input_ids'][i][j]] = tmin
+             for t in model_kwargs['decoder_ori_input_ids'][i]:
+                 for j in range(group_size):
+                     # if t not in input_ids[i] or t == 1:
+                     next_token_scores_group[i][j * vocab_size + t] = next_token_scores[i][j * vocab_size + t]
+
+         next_token_scores, next_tokens = torch.topk(
+             next_token_scores_group, 2 * group_size, dim=1, largest=True, sorted=True
+         )
+
+         # original top-k over the un-averaged, per-example scores, kept for reference:
+         # next_token_scores, next_tokens = torch.topk(
+         #     next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
+         # )
+
+         next_indices = next_tokens // vocab_size
+         next_tokens = next_tokens % vocab_size
+
+         # stateless
+         beam_outputs = beam_scorer.process(
+             group_input_ids,
+             next_token_scores,
+             next_tokens,
+             next_indices,
+             pad_token_id=pad_token_id,
+             eos_token_id=eos_token_id,
+         )
+         beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
+         beam_next_tokens = beam_outputs["next_beam_tokens"]
+         beam_idx = beam_outputs["next_beam_indices"]
+
+         input_ids[batch_group_indices] = group_input_ids[beam_idx]
+         group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
+         current_tokens[batch_group_indices] = group_input_ids[:, -1]
+
+         # (beam_idx // group_size) -> batch_idx
+         # (beam_idx % group_size) -> offset of idx inside the group
+         reordering_indices[batch_group_indices] = (
+             num_beams * (beam_idx // group_size) + group_start_idx + (beam_idx % group_size)
+         )
+
+     # Store scores, attentions and hidden_states when required
+     if return_dict_in_generate:
+         if output_scores:
+             scores += (processed_score,)
+         if output_attentions:
+             decoder_attentions += (
+                 (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+             )
+
+         if output_hidden_states:
+             decoder_hidden_states += (
+                 (outputs.decoder_hidden_states,)
+                 if self.config.is_encoder_decoder
+                 else (outputs.hidden_states,)
+             )
+
+     model_kwargs = self._update_model_kwargs_for_generation(
+         outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+     )
+     if model_kwargs["past"] is not None:
+         model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], reordering_indices)
+
+     input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)
+     cur_len = cur_len + 1
+     if beam_scorer.is_done:
+         break
+
+ sequence_outputs = beam_scorer.finalize(
+     input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id, max_length=max_length,
+ )
+
+ if return_dict_in_generate:
+     if not output_scores:
+         sequence_outputs["sequence_scores"] = None
+     if self.config.is_encoder_decoder:
+         return BeamSearchEncoderDecoderOutput(
+             sequences=sequence_outputs["sequences"],
+             sequences_scores=sequence_outputs["sequence_scores"],
+             scores=scores,
+             encoder_attentions=encoder_attentions,
+             encoder_hidden_states=encoder_hidden_states,
+             decoder_attentions=decoder_attentions,
+             decoder_hidden_states=decoder_hidden_states,
+         )
+     else:
+         return BeamSearchDecoderOnlyOutput(
+             sequences=sequence_outputs["sequences"],
+             sequences_scores=sequence_outputs["sequence_scores"],
+             scores=scores,
+             attentions=decoder_attentions,
+             hidden_states=decoder_hidden_states,
+         )
+ else:
+     return sequence_outputs["sequences"]
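
For orientation: the docstring above shows the low-level `group_beam_search` call this file modifies. In stock `transformers` the same diverse (group) beam search is reachable through the high-level `generate` API. A minimal sketch, assuming a recent `transformers` release; the model name and parameter values are illustrative and are not this repo's modified decoding loop:

```python
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

input_ids = tokenizer("Bill Gates <mask> Microsoft.", return_tensors="pt").input_ids
outputs = model.generate(
    input_ids,
    num_beams=6,
    num_beam_groups=3,      # split the 6 beams into 3 groups
    diversity_penalty=5.5,  # Hamming diversity penalty between groups
    num_return_sequences=6,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```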
src/distinct_n/.gitignore ADDED
@@ -0,0 +1,58 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ state.py
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
src/distinct_n/.idea/Distinct-N.iml ADDED
@@ -0,0 +1,11 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$">
+       <sourceFolder url="file://$MODULE_DIR$/distinct_n" isTestSource="false" />
+       <excludeFolder url="file://$MODULE_DIR$/docs" />
+     </content>
+     <orderEntry type="jdk" jdkName="Python 3.6 (Metrics)" jdkType="Python SDK" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
src/distinct_n/.idea/encodings.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="Encoding" addBOMForNewFiles="with NO BOM" />
+ </project>
src/distinct_n/.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="JavaScriptSettings">
+     <option name="languageLevel" value="ES6" />
+   </component>
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (tensorflow)" project-jdk-type="Python SDK" />
+ </project>
src/distinct_n/.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/Distinct-N.iml" filepath="$PROJECT_DIR$/.idea/Distinct-N.iml" />
+     </modules>
+   </component>
+ </project>
src/distinct_n/.idea/other.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="PySciProjectComponent">
+     <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
+   </component>
+ </project>
src/distinct_n/.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="$PROJECT_DIR$" vcs="Git" />
+   </component>
+ </project>
src/distinct_n/.idea/webResources.xml ADDED
@@ -0,0 +1,14 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="WebResourcesPaths">
+     <contentEntries>
+       <entry url="file://$PROJECT_DIR$">
+         <entryData>
+           <resourceRoots>
+             <path value="file://$PROJECT_DIR$/testdata" />
+           </resourceRoots>
+         </entryData>
+       </entry>
+     </contentEntries>
+   </component>
+ </project>
src/distinct_n/A Diversity-Promoting Objective Function for Neural Conversation Models.pdf ADDED
Binary file (200 kB).
 
src/distinct_n/LICENSE.txt ADDED
@@ -0,0 +1,202 @@
+
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
src/distinct_n/README.md ADDED
@@ -0,0 +1,30 @@
+ # Distinct-N
+ Distinct-N, most notably distinct-1 and distinct-2, is a metric that measures the
+ diversity of a sentence. It focuses on the number of *distinct* n-grams in a sentence and thus
+ penalizes sentences with many repeated words. The metric requires no *reference* or *ground truth*
+ sentence and depends entirely on the properties of the (system-generated) sentence itself.
+ It was proposed by Jiwei Li et al. in the paper *A Diversity-Promoting Objective Function for Neural Conversation Models*.
+
+ # Definitions
+ The original paper defined *Distinct-N* as:
+
+ > We report degree of diversity by calculating the number of distinct unigrams and bigrams in generated responses.
+ > The value is scaled by the total number of generated tokens to avoid favoring long sentences.
+
+ which is exactly what we described above.
+
+ # Usage
+ ```bash
+ $ python distinct_metric.py -n N_GRAMS PREDICTION
+ ```
+
+ where `N_GRAMS` is the length of the token sequences to count as unique within one sentence, and
+ `PREDICTION` is the prediction or response your model generates, with one utterance (sentence) per line.
+
+ # Dependencies
+ `python>=3.6.1`
+
+ # References
+ [1] A Diversity-Promoting Objective Function for Neural Conversation Models
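
For concreteness, a minimal sketch of the per-sentence computation the paper describes (plain Python with no dependencies; the function name is ours, not this package's API):

```python
def distinct_n_ratio(tokens, n):
    """Number of distinct n-grams scaled by the total number of tokens, per Li et al."""
    ngrams = {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}
    return len(ngrams) / len(tokens) if tokens else 0.0

print(distinct_n_ratio("the the the the cat".split(), 1))  # 2 distinct / 5 tokens = 0.4
print(distinct_n_ratio("the cat sat on the".split(), 2))   # 4 distinct / 5 tokens = 0.8
```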
src/distinct_n/bin/distinct_metric.py ADDED
@@ -0,0 +1,29 @@
+ import argparse
+ import logging
+
+ from distinct_n import distinct_n_sentence_level
+ from pathlib import Path
+ from agenda.metric_helper import write_score
+
+ NAME = 'distinct_n'
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('hypothesis', help="predicted text file, one example per line")
+     parser.add_argument('-n', dest='n_range', type=int, nargs='+', help="n to use as in distinct-N")
+     parser.add_argument('--output_dir')
+     args = parser.parse_args()
+
+     logging.basicConfig(level=logging.INFO)
+     logging.info('loading hypothesis file...')
+     with open(args.hypothesis) as f:
+         hypothesis = [sentence.split() for sentence in f.readlines()]
+
+     output_dir = Path(args.output_dir)
+     for n in args.n_range:
+         write_score(
+             name=NAME,
+             output=output_dir.joinpath(f'{NAME}_{n}').with_suffix('.json'),
+             params={'n': n},
+             scores=[distinct_n_sentence_level(s, n) for s in hypothesis],
+         )
src/distinct_n/bin/score.sh ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env bash
+
+ HYPO=/home/cgsdfc/UbuntuDialogueCorpus/ResponseContextPairs/ModelPredictions/VHRED/First_VHRED_BeamSearch_5_GeneratedTestResponses.txt_First.txt
+ DIR=/home/cgsdfc/Result/Test
+
+ python bin/distinct_metric.py --output_dir $DIR $HYPO -n 3
src/distinct_n/distinct_n/metrics.py ADDED
@@ -0,0 +1,33 @@
+ from src.distinct_n.distinct_n.utils import ngrams
+
+ __all__ = ["distinct_n_sentence_level", "distinct_n_corpus_level"]
+
+
+ def distinct_n_sentence_level(sentence, n):
+     """
+     Collect the distinct n-grams of a single sentence.
+     Modified from the original metric, which returned
+     len(distinct_ngrams) / len(sentence); here the distinct n-grams
+     themselves are returned so they can be pooled at the corpus level.
+     :param sentence: a list of words.
+     :param n: int, ngram.
+     :return: list of distinct n-gram tuples.
+     """
+     if len(sentence) == 0:
+         return []  # an empty sentence contributes no n-grams
+     # distinct_ngrams = set(ngrams(sentence, n))
+     return list(set(ngrams(sentence, n)))
+     # original: return len(distinct_ngrams) / len(sentence)
+
+
+ def distinct_n_corpus_level(sentences, n):
+     """
+     Compute distinct-N of a list of sentences (the corpus): the number of
+     distinct n-grams pooled across the corpus, divided by the total number
+     of tokens.
+     :param sentences: a list of sentences, each a list of words.
+     :param n: int, ngram.
+     :return: float, the corpus-level value.
+     """
+     temp = []
+     length = 0
+     for sentence in sentences:
+         length += len(sentence)
+         temp.extend(distinct_n_sentence_level(sentence, n))
+     return len(set(temp)) / length
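
A small sketch of the pooled corpus-level computation above (the import path assumes the repo root is on `PYTHONPATH`):

```python
from src.distinct_n.distinct_n.metrics import distinct_n_corpus_level

corpus = [
    "the cat sat on the mat".split(),
    "the dog sat on the mat".split(),
]
print(distinct_n_corpus_level(corpus, 1))  # 6 distinct unigrams / 12 tokens = 0.5
print(distinct_n_corpus_level(corpus, 2))  # 7 distinct bigrams / 12 tokens ≈ 0.583
```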
src/distinct_n/distinct_n/test.py ADDED
@@ -0,0 +1,32 @@
+ import unittest
+
+ from distinct_n import distinct_n_sentence_level
+ from distinct_n import distinct_n_corpus_level
+
+
+ # NOTE: these tests target the original definition of the metric, in which
+ # distinct_n_sentence_level returns len(distinct_ngrams) / len(sentence) and
+ # distinct_n_corpus_level averages that ratio over sentences; the modified
+ # functions in metrics.py above pool n-grams across the corpus instead.
+ class TestDistinctN(unittest.TestCase):
+     def test_unigram(self):
+         sentence = "the the the the the".split()
+         self.assertAlmostEqual(
+             distinct_n_sentence_level(sentence, 1), 0.2
+         )
+         sentence = "the the the the cat".split()
+         self.assertAlmostEqual(
+             distinct_n_sentence_level(sentence, 1), 0.4
+         )
+
+     def test_bigram(self):
+         sentence = "the cat sat on the".split()
+         self.assertAlmostEqual(
+             distinct_n_sentence_level(sentence, 2), 0.8
+         )
+
+     def test_corpus_level(self):
+         sentences = [
+             'the cat sat on the mat'.split(),
+             'mat the on sat cat the'.split(),
+             'i do not know'.split(),
+             'Sorry but i do not know'.split(),
+         ]
+         self.assertAlmostEqual(0.916666, distinct_n_corpus_level(sentences, 1), delta=1e-5)
+         self.assertAlmostEqual(0.8125, distinct_n_corpus_level(sentences, 2), delta=1e-5)
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ """
+ Copied from nltk.ngrams().
+ """
+ from itertools import chain
+
+ __all__ = ["ngrams"]
+
+
+ def pad_sequence(sequence, n, pad_left=False, pad_right=False,
+                  left_pad_symbol=None, right_pad_symbol=None):
+     """
+     Returns a padded sequence of items before ngram extraction.
+
+     >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
+     ['<s>', 1, 2, 3, 4, 5, '</s>']
+     >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
+     ['<s>', 1, 2, 3, 4, 5]
+     >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
+     [1, 2, 3, 4, 5, '</s>']
+
+     :param sequence: the source data to be padded
+     :type sequence: sequence or iter
+     :param n: the degree of the ngrams
+     :type n: int
+     :param pad_left: whether the ngrams should be left-padded
+     :type pad_left: bool
+     :param pad_right: whether the ngrams should be right-padded
+     :type pad_right: bool
+     :param left_pad_symbol: the symbol to use for left padding (default is None)
+     :type left_pad_symbol: any
+     :param right_pad_symbol: the symbol to use for right padding (default is None)
+     :type right_pad_symbol: any
+     :rtype: sequence or iter
+     """
+     sequence = iter(sequence)
+     if pad_left:
+         sequence = chain((left_pad_symbol,) * (n - 1), sequence)
+     if pad_right:
+         sequence = chain(sequence, (right_pad_symbol,) * (n - 1))
+     return sequence
+
+
+ def ngrams(sequence, n, pad_left=False, pad_right=False,
+            left_pad_symbol=None, right_pad_symbol=None):
+     """
+     Return the ngrams generated from a sequence of items, as an iterator.
+     For example:
+
+     >>> from nltk.util import ngrams
+     >>> list(ngrams([1,2,3,4,5], 3))
+     [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
+
+     Wrap with list for a list version of this function. Set pad_left
+     or pad_right to true in order to get additional ngrams:
+
+     >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
+     [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
+     >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
+     [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
+     >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
+     [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
+     >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
+     [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
+
+     :param sequence: the source data to be converted into ngrams
+     :type sequence: sequence or iter
+     :param n: the degree of the ngrams
+     :type n: int
+     :param pad_left: whether the ngrams should be left-padded
+     :type pad_left: bool
+     :param pad_right: whether the ngrams should be right-padded
+     :type pad_right: bool
+     :param left_pad_symbol: the symbol to use for left padding (default is None)
+     :type left_pad_symbol: any
+     :param right_pad_symbol: the symbol to use for right padding (default is None)
+     :type right_pad_symbol: any
+     :rtype: sequence or iter
+     """
+     sequence = pad_sequence(sequence, n, pad_left, pad_right,
+                             left_pad_symbol, right_pad_symbol)
+
+     history = []
+     while n > 1:
+         history.append(next(sequence))
+         n -= 1
+     for item in sequence:
+         history.append(item)
+         yield tuple(history)
+         del history[0]
src/distinct_n/setup.py ADDED
@@ -0,0 +1,29 @@
+ from setuptools import setup
+
+ __version__ = '0.4.0'
+
+ setup(
+     name='Distinct_N',
+     version=__version__,
+     description='Distinct-N metric that measures the degree of diversity of generated responses',
+     url='https://github.com/neural-dialogue-metrics/Distinct-N.git',
+     author='cgsdfc',
+     author_email='cgsdfc@126.com',
+     keywords=[
+         'NL', 'CL', 'MT',
+         'natural language processing',
+         'computational linguistics',
+         'machine translation',
+     ],
+     packages=['distinct_n'],
+     scripts=['bin/distinct_metric.py'],
+     classifiers=[
+         'Intended Audience :: Science/Research',
+         'License :: OSI Approved :: Apache Software License',
+         'Programming Language :: Python :: 3',
+         'Topic :: Text Processing :: Linguistic',
+     ],
+     license='LICENSE.txt',
+     long_description=open('README.md').read(),
+     install_requires=[],
+ )
src/distinct_n/testdata/bigram.txt ADDED
@@ -0,0 +1 @@
+ the cat sat on the mat
src/distinct_n/testdata/unigram.txt ADDED
@@ -0,0 +1 @@
+ the the the the a
src/utils.py ADDED
@@ -0,0 +1,133 @@
+ from ngram import NGram
+
+
+ def post_process_template(tB):
+     if not tB.endswith('.'):
+         tB += '.'
+     return tB
+     # return tB.split('.')[0] + '.'
+
+
+ def construct_template(words, templateA, if_then=False):
+     if len(words) == 2:
+         # templates = ['{} <mask> {}.'.format(words[0], words[1])]
+         templates = [
+             # '{} is <mask> {}.'.format(words[0], words[1]),
+             '{} <mask> {}.'.format(words[0], words[1]),
+         ]
+     elif len(words) == 1:
+         templates = [
+             # '{} is <mask>.'.format(words[0]),
+             '{} <mask>.'.format(words[0])]
+     else:
+         templates = []
+
+     if if_then:
+         for word in words:
+             index = templateA.index('<mask>')
+             templateA = templateA[:index] + word + templateA[index + len('<mask>'):]
+         templates = ['If ' + templateA + ' then ' + template for template in templates]
+
+     return templates
+
+
+ def filter_words(words_prob):
+     word_count = {}
+     token1_count = {}
+     word2_count = {}
+     ret = []
+     for words, prob, *_ in words_prob:
+         filter_this = False
+
+         # penalize candidates with repeated tokens
+         token_count = {}
+         for word in words:
+             for token in word.split(' '):
+                 if token in token_count:
+                     filter_this = True
+                 token_count[token] = 1
+         if filter_this:
+             prob *= 0.5
+
+         # drop candidates whose two words are identical
+         if len(words) == 2 and words[0] == words[1]:
+             continue
+
+         # penalize a repeated first token across candidates
+         token1 = words[0].split(' ')[0]
+         if token1 not in token1_count:
+             token1_count[token1] = 1
+         else:
+             token1_count[token1] += 1
+         prob /= token1_count[token1]
+
+         # penalize repeated words across candidates
+         for word in words:
+             if word not in word_count:
+                 word_count[word] = 0
+             word_count[word] += 1
+             prob /= word_count[word]
+
+         if len(words) == 2:
+             if words[1] not in word2_count:
+                 word2_count[words[1]] = 0
+             word2_count[words[1]] += 1
+             prob /= word2_count[words[1]]
+
+         ret.append([words, prob])
+     return sorted(ret, key=lambda x: x[1], reverse=True)
+
+
+ import math
+ from copy import deepcopy
+
+
+ def convert_for_print(arr):
+     ret = deepcopy(arr)
+     for i in range(len(ret)):
+         ret[i][1] = round(ret[i][1], 7)
+         if len(ret[i]) == 3:
+             for j in range(len(ret[i][2])):
+                 ret[i][2][j] = round(ret[i][2][j], 7)
+     return ret
+
+
+ def formalize_tA(tA):
+     tA = tA.strip()
+     if tA.endswith('.'):
+         tA = tA[:-1].strip() + '.'
+     else:
+         tA += '.'
+     tA = tA.replace(' ,', ',')
+     tA = tA.replace(" '", "'")
+     return tA
+
+
+ ngram_n = 3
+
+
+ def extract_similar_words(txt, words):
+     max_word_length = 0
+     for word in words:
+         if len(word) > max_word_length:
+             max_word_length = len(word)
+
+     txt_ngrams = []
+     for i in range(len(txt)):
+         for j in range(i + ngram_n, min(len(txt), i + max_word_length + 5)):
+             txt_ngrams.append(txt[i:j].lower())
+     n = NGram(txt_ngrams, key=lambda x: x.lower(), N=ngram_n)
+     ret = []
+     for word in words:
+         matched_word = n.find(word.lower(), 0.5)
+         if matched_word is None:
+             return None
+         ret.append(matched_word)
+     return ret
+
+
+ def extract_words(txt, words):
+     for word in words:
+         if word not in txt:
+             return None
+     return [word.lower() for word in words]
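
A quick sanity check of the template helpers above, as a sketch; the import assumes you run from the repo root, and the entity strings are made-up examples:

```python
from src.utils import construct_template, formalize_tA

# two extracted entities yield a single relation template
print(construct_template(['Obama', 'the USA'], '<mask> is the president of <mask>.'))
# ['Obama <mask> the USA.']

# with if_then=True, templateA's masks are filled and prepended as a premise
print(construct_template(['Obama'], '<mask> is a person.', if_then=True))
# ['If Obama is a person. then Obama <mask>.']

# formalize_tA normalizes spacing and guarantees a trailing period
print(formalize_tA("Bill Gates founded Microsoft , right"))
# 'Bill Gates founded Microsoft, right.'
```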