larkkin committed on
Commit
c45d283
1 Parent(s): 4d8e00f

Add code and readme

This view is limited to 50 files because it contains too many changes.
Files changed (50):
  1. README.md +70 -0
  2. app.py +97 -0
  3. config/__init__.py +0 -0
  4. config/params.py +89 -0
  5. data/__init__.py +0 -0
  6. data/batch.py +95 -0
  7. data/dataset.py +245 -0
  8. data/field/__init__.py +0 -0
  9. data/field/anchor_field.py +19 -0
  10. data/field/anchored_label_field.py +38 -0
  11. data/field/basic_field.py +11 -0
  12. data/field/bert_field.py +18 -0
  13. data/field/edge_field.py +63 -0
  14. data/field/edge_label_field.py +67 -0
  15. data/field/field.py +70 -0
  16. data/field/label_field.py +36 -0
  17. data/field/mini_torchtext/example.py +100 -0
  18. data/field/mini_torchtext/field.py +637 -0
  19. data/field/mini_torchtext/pipeline.py +86 -0
  20. data/field/mini_torchtext/utils.py +256 -0
  21. data/field/mini_torchtext/vocab.py +116 -0
  22. data/field/nested_field.py +50 -0
  23. data/parser/__init__.py +0 -0
  24. data/parser/from_mrp/__init__.py +0 -0
  25. data/parser/from_mrp/abstract_parser.py +50 -0
  26. data/parser/from_mrp/evaluation_parser.py +18 -0
  27. data/parser/from_mrp/labeled_edge_parser.py +70 -0
  28. data/parser/from_mrp/node_centric_parser.py +69 -0
  29. data/parser/from_mrp/request_parser.py +23 -0
  30. data/parser/from_mrp/sequential_parser.py +90 -0
  31. data/parser/json_parser.py +35 -0
  32. data/parser/to_mrp/__init__.py +0 -0
  33. data/parser/to_mrp/abstract_parser.py +80 -0
  34. data/parser/to_mrp/labeled_edge_parser.py +52 -0
  35. data/parser/to_mrp/node_centric_parser.py +35 -0
  36. data/parser/to_mrp/sequential_parser.py +35 -0
  37. model/__init__.py +0 -0
  38. model/head/__init__.py +0 -0
  39. model/head/abstract_head.py +274 -0
  40. model/head/labeled_edge_head.py +67 -0
  41. model/head/node_centric_head.py +25 -0
  42. model/head/sequential_head.py +24 -0
  43. model/model.py +82 -0
  44. model/module/__init__.py +0 -0
  45. model/module/anchor_classifier.py +32 -0
  46. model/module/biaffine.py +20 -0
  47. model/module/bilinear.py +43 -0
  48. model/module/char_embedding.py +42 -0
  49. model/module/edge_classifier.py +56 -0
  50. model/module/encoder.py +95 -0
README.md CHANGED
@@ -1,3 +1,73 @@
1
  ---
2
  license: apache-2.0
3
+ datasets:
4
+ - ltg/norec
5
+ language:
6
+ - 'no'
7
+ pipeline_tag: token-classification
8
+
9
+
10
+ model-index:
11
+ - name: SSA-Perin
12
+ results:
13
+ - task:
14
+ type: structured sentiment analysis
15
+ dataset:
16
+ name: NoReC
17
+ type: NoReC
18
+ metrics:
19
+ - name: Unlabeled sentiment tuple F1
20
+ type: Unlabeled sentiment tuple F1
21
+ value: 44.12%
22
+ - name: Target F1
23
+ type: Target F1
24
+ value: 56.44%
25
+ - name: Relative polarity precision
26
+ type: Relative polarity precision
27
+ value: 93.19%
28
  ---
29
+
30
+
31
+
32
+ This repository contains a pretrained model (and an easy-to-run wrapper for it) for structured sentiment analysis in Norwegian, trained on the [NoReC_fine dataset](https://github.com/ltgoslo/norec_fine).
33
+ This is an implementation of the method described in
34
+ ```bibtex
35
+ @misc{samuel2022direct,
36
+ title={Direct parsing to sentiment graphs},
37
+ author={David Samuel and Jeremy Barnes and Robin Kurtz and Stephan Oepen and Lilja Øvrelid and Erik Velldal},
38
+ year={2022},
39
+ eprint={2203.13209},
40
+ archivePrefix={arXiv},
41
+ primaryClass={cs.CL}
42
+ }
43
+ ```
44
+ The main repository, which also contains the scripts for training the model, can be found on the project [GitHub](https://github.com/jerbarnes/direct_parsing_to_sent_graph).
45
+ The model is also available as an [HF space](https://huggingface.co/spaces/ltg/ssa-perin).
46
+
47
+
48
+ The sentiment graph model is based on an underlying masked language model – [NorBERT 2](https://huggingface.co/ltg/norbert2).
49
+ The method proposes three different ways to encode the sentiment graph: "node-centric", "labeled-edge", and "opinion-tuple".
50
+ The current model:
51
+ - uses the "labeled-edge" graph encoding,
52
+ - does not use character-level embeddings,
53
+ - keeps all other hyperparameters at their [default values](https://github.com/jerbarnes/direct_parsing_to_sent_graph/blob/main/perin/config/edge_norec.yaml).
54
+ It achieves the following results on the held-out set of the dataset:
55
+
56
+ | Unlabeled sentiment tuple F1 | Target F1 | Relative polarity precision |
57
+ |:----------------------------:|:----------:|:---------------------------:|
58
+ | 0.434 | 0.541 | 0.926 |
59
+
60
+
61
+ The model can be easily used for predicting sentiment tuples as follows:
62
+
63
+ ```python
64
+ >>> import model_wrapper
65
+ >>> model = model_wrapper.PredictionModel()
66
+ >>> model.predict(['vi liker svart kaffe'])
67
+ [{'sent_id': '0',
68
+ 'text': 'vi liker svart kaffe',
69
+ 'opinions': [{'Source': [['vi'], ['0:2']],
70
+ 'Target': [['svart', 'kaffe'], ['9:14', '15:20']],
71
+ 'Polar_expression': [['liker'], ['3:8']],
72
+ 'Polarity': 'Positive'}]}]
73
+ ```
app.py ADDED
@@ -0,0 +1,97 @@
1
+ import gradio as gr
2
+ import model_wrapper
3
+
4
+
5
+ model = model_wrapper.PredictionModel()
6
+
7
+
8
+ def pretty_print_opinion(opinion_dict):
9
+ res = []
10
+ maxlen = max([len(key) for key in opinion_dict.keys()]) + 2
11
+ maxlen = 0  # overrides the computed width above; keys are printed without padding
12
+ for key, value in opinion_dict.items():
13
+ if key == 'Polarity':
14
+ res.append(f'{(key + ":").ljust(maxlen)} {value}')
15
+ else:
16
+ res.append(f'{(key + ":").ljust(maxlen)} \'{" ".join(value[0])}\'')
17
+ return '\n'.join(res) + '\n'
18
+
19
+
20
+ def predict(text):
21
+ print(f'Input message "{text}"')
22
+ try:
23
+ predictions = model([text])
24
+ prediction = predictions[0]
25
+ results = []
26
+ if not prediction['opinions']:
27
+ return 'No opinions detected'
28
+ for opinion in prediction['opinions']:
29
+ results.append(pretty_print_opinion(opinion))
30
+ print(f'Successfully predicted SA for input message "{text}": {results}')
31
+ return '\n'.join(results)
32
+ except Exception as e:
33
+ print(f'Error for input message "{text}": {e}')
34
+ raise e
35
+
36
+
37
+
38
+ markdown_text = '''
39
+ <br>
40
+ <br>
41
+ This space provides a Gradio demo and an easy-to-run wrapper of the pretrained model for structured sentiment analysis in Norwegian, trained on the [NoReC dataset](https://huggingface.co/datasets/norec).
42
+ This space contains an implementation of the method described in "Direct parsing to sentiment graphs" (Samuel _et al._, ACL 2022). The main repository, which also contains the scripts for training the model, can be found on the project [GitHub](https://github.com/jerbarnes/direct_parsing_to_sent_graph).
43
+
44
+ The sentiment graph model is based on an underlying masked language model – [NorBERT 2](https://huggingface.co/ltg/norbert2).
45
+ The method proposes three different ways to encode the sentiment graph: "node-centric", "labeled-edge", and "opinion-tuple".
46
+ The current model:
47
+ - uses the "labeled-edge" graph encoding,
48
+ - does not use character-level embeddings,
49
+ - keeps all other hyperparameters at their [default values](https://github.com/jerbarnes/direct_parsing_to_sent_graph/blob/main/perin/config/edge_norec.yaml).
50
+ It achieves the following results on the held-out set of the NoReC dataset:
51
+
52
+ | Unlabeled sentiment tuple F1 | Target F1 | Relative polarity precision |
53
+ |:----------------------------:|:----------:|:---------------------------:|
54
+ | 0.434 | 0.541 | 0.926 |
55
+
56
+
57
+ In "Word Substitution with Masked Language Models as Data Augmentation for Sentiment Analysis", we analyzed data augmentation strategies for improving the performance of the model. Using masked language modeling (MLM), we augmented the sentences with MLM-substituted words inside, outside, or inside+outside the actual sentiment tuples. The results below show that augmentation may improve the model's performance. This space, however, runs the original model trained without augmentation.
58
+
59
+ | | Augmentation rate | Unlabeled sentiment tuple F1 | Target F1 | Relative polarity precision |
60
+ |----------------|-------------------|------------------------------|-----------|-----------------------------|
61
+ | Baseline | 0% | 43.39 | 54.13 | 92.59 |
62
+ | Outside | 59% | **45.08** | 56.18 | 92.95 |
63
+ | Inside | 9% | 43.38 | 55.62 | 92.49 |
64
+ | Inside+Outside | 27% | 44.12 | **56.44** | **93.19** |
65
+
66
+
67
+
68
+ The model can be easily used for predicting sentiment tuples as follows:
69
+
70
+ ```python
71
+ >>> import model_wrapper
72
+ >>> model = model_wrapper.PredictionModel()
73
+ >>> model.predict(['vi liker svart kaffe'])
74
+ [{'sent_id': '0',
75
+ 'text': 'vi liker svart kaffe',
76
+ 'opinions': [{'Source': [['vi'], ['0:2']],
77
+ 'Target': [['svart', 'kaffe'], ['9:14', '15:20']],
78
+ 'Polar_expression': [['liker'], ['3:8']],
79
+ 'Polarity': 'Positive'}]}]
80
+ ```
81
+ '''
82
+
83
+
84
+
85
+ with gr.Blocks() as demo:
86
+ with gr.Row() as row:
87
+ text_input = gr.Textbox(label="input")
88
+ text_output = gr.Textbox(label="output")
89
+ with gr.Row() as row:
90
+ text_button = gr.Button("submit")
91
+
92
+ text_button.click(fn=predict, inputs=text_input, outputs=text_output)
93
+
94
+ gr.Markdown(markdown_text)
95
+
96
+
97
+ demo.launch()
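
For local testing outside the Gradio UI, the same prediction path can be exercised directly. The sketch below mirrors how `predict()` above calls the model and assumes that `model_wrapper.PredictionModel` (part of this repository) loads the checkpoint on construction.

```python
# Sketch only: call the model the same way app.py does (model([text])),
# assuming model_wrapper.PredictionModel is importable from the repository root.
import model_wrapper

model = model_wrapper.PredictionModel()
for prediction in model(['vi liker svart kaffe']):
    for opinion in prediction['opinions']:
        # opinion keys follow the format shown in the README:
        # Source, Target, Polar_expression, Polarity
        print(opinion['Polarity'], opinion['Polar_expression'])
```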
config/__init__.py ADDED
File without changes
config/params.py ADDED
@@ -0,0 +1,89 @@
1
+ import yaml
2
+
3
+
4
+ class Params:
5
+ def __init__(self):
6
+ self.graph_mode = "sequential" # possibilities: {sequential, node-centric, labeled-edge}
7
+ self.accumulation_steps = 1 # number of gradient accumulation steps for achieving a bigger batch_size
8
+ self.activation = "relu" # transformer (decoder) activation function, supported values: {'relu', 'gelu', 'sigmoid', 'mish'}
9
+ self.predict_intensity = False
10
+ self.batch_size = 32 # batch size (further divided into multiple GPUs)
11
+ self.beta_2 = 0.98 # beta 2 parameter for Adam(W) optimizer
12
+ self.blank_weight = 1.0 # weight of cross-entropy loss for predicting an empty label
13
+ self.char_embedding = True # use character embedding in addition to bert
14
+ self.char_embedding_size = 128 # dimension of the character embedding layer in the character embedding module
15
+ self.decoder_delay_steps = 0 # number of initial steps with frozen decoder
16
+ self.decoder_learning_rate = 6e-4 # initial decoder learning rate
17
+ self.decoder_weight_decay = 1.2e-6 # amount of weight decay
18
+ self.dropout_anchor = 0.5 # dropout at the last layer of anchor classifier
19
+ self.dropout_edge_label = 0.5 # dropout at the last layer of edge label classifier
20
+ self.dropout_edge_presence = 0.5 # dropout at the last layer of edge presence classifier
21
+ self.dropout_label = 0.5 # dropout at the last layer of label classifier
22
+ self.dropout_transformer = 0.5 # dropout for the transformer layers (decoder)
23
+ self.dropout_transformer_attention = 0.1 # dropout for the transformer's attention (decoder)
24
+ self.dropout_word = 0.1 # probability of dropping out a whole word from the encoder (in favour of char embedding)
25
+ self.encoder = "xlm-roberta-base" # pretrained encoder model
26
+ self.encoder_delay_steps = 2000 # number of initial steps with frozen XLM-R
27
+ self.encoder_freeze_embedding = True # freeze the first embedding layer in XLM-R
28
+ self.encoder_learning_rate = 6e-5 # initial encoder learning rate
29
+ self.encoder_weight_decay = 1e-2 # amount of weight decay
30
+ self.lr_decay_multiplier = 100
31
+ self.epochs = 100 # number of epochs for train
32
+ self.focal = True # use focal loss for the label prediction
33
+ self.freeze_bert = False # freeze the pretrained encoder weights
34
+ self.group_ops = False # group 'opN' edge labels into one
35
+ self.hidden_size_ff = 4 * 768 # hidden size of the transformer feed-forward submodule
36
+ self.hidden_size_anchor = 128 # hidden size anchor biaffine layer
37
+ self.hidden_size_edge_label = 256 # hidden size for edge label biaffine layer
38
+ self.hidden_size_edge_presence = 512 # hidden size for edge presence biaffine layer
39
+ self.layerwise_lr_decay = 1.0 # layerwise decay of learning rate in the encoder
40
+ self.n_attention_heads = 8 # number of attention heads in the decoding transformer
41
+ self.n_layers = 3 # number of layers in the decoder
42
+ self.query_length = 4 # number of queries generated for each word on the input
43
+ self.pre_norm = True # use pre-normalized version of the transformer (as in Transformers without Tears)
44
+ self.warmup_steps = 6000 # number of the warm-up steps for the inverse_sqrt scheduler
45
+
46
+ def init_data_paths(self):
47
+ directory_1 = {
48
+ "sequential": "node_centric_mrp",
49
+ "node-centric": "node_centric_mrp",
50
+ "labeled-edge": "labeled_edge_mrp"
51
+ }[self.graph_mode]
52
+ directory_2 = {
53
+ ("darmstadt", "en"): "darmstadt_unis",
54
+ ("mpqa", "en"): "mpqa",
55
+ ("multibooked", "ca"): "multibooked_ca",
56
+ ("multibooked", "eu"): "multibooked_eu",
57
+ ("norec", "no"): "norec",
58
+ ("opener", "en"): "opener_en",
59
+ ("opener", "es"): "opener_es",
60
+ }[(self.framework, self.language)]
61
+
62
+ self.training_data = f"{self.data_directory}/{directory_1}/{directory_2}/train.mrp"
63
+ self.validation_data = f"{self.data_directory}/{directory_1}/{directory_2}/dev.mrp"
64
+ self.test_data = f"{self.data_directory}/{directory_1}/{directory_2}/test.mrp"
65
+
66
+ self.raw_training_data = f"{self.data_directory}/raw/{directory_2}/train.json"
67
+ self.raw_validation_data = f"{self.data_directory}/raw/{directory_2}/dev.json"
68
+
69
+ return self
70
+
71
+ def load_state_dict(self, d):
72
+ for k, v in d.items():
73
+ setattr(self, k, v)
74
+ return self
75
+
76
+ def state_dict(self):
77
+ members = [attr for attr in dir(self) if not callable(getattr(self, attr)) and not attr.startswith("__")]
78
+ return {k: self.__dict__[k] for k in members}
79
+
80
+ def load(self, args):
81
+ with open(args.config, "r", encoding="utf-8") as f:
82
+ params = yaml.safe_load(f)
83
+ self.load_state_dict(params)
84
+ self.init_data_paths()
85
+
86
+ def save(self, json_path):
87
+ with open(json_path, "w", encoding="utf-8") as f:
88
+ d = self.state_dict()
89
+ yaml.dump(d, f)
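
`Params.load` copies every key of the YAML config onto the object and then derives the dataset paths, so the config must also define `framework`, `language`, and `data_directory` (they are not set in `__init__`). A minimal, hypothetical sketch of how it might be driven; the config path and the `SimpleNamespace` stand-in for parsed CLI arguments are assumptions, not part of this repository.

```python
# Hypothetical usage sketch of config/params.py.
from types import SimpleNamespace
from config.params import Params

args = SimpleNamespace(config="config/edge_norec.yaml")  # assumed path to a YAML config
params = Params()
params.load(args)  # sets attributes from the YAML, then calls init_data_paths()

# init_data_paths() expects graph_mode, framework, language and data_directory
# to be defined in the YAML, e.g.:
#   graph_mode: labeled-edge, framework: norec, language: "no", data_directory: data
print(params.graph_mode, params.training_data)
```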
data/__init__.py ADDED
File without changes
data/batch.py ADDED
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+
8
+ class Batch:
9
+ @staticmethod
10
+ def build(data):
11
+ fields = list(data[0].keys())
12
+ transposed = {}
13
+ for field in fields:
14
+ if isinstance(data[0][field], tuple):
15
+ transposed[field] = tuple(Batch._stack(field, [example[field][i] for example in data]) for i in range(len(data[0][field])))
16
+ else:
17
+ transposed[field] = Batch._stack(field, [example[field] for example in data])
18
+
19
+ return transposed
20
+
21
+ @staticmethod
22
+ def _stack(field: str, examples):
23
+ if field == "anchored_labels":
24
+ return examples
25
+
26
+ dim = examples[0].dim()
27
+
28
+ if dim == 0:
29
+ return torch.stack(examples)
30
+
31
+ lengths = [max(example.size(i) for example in examples) for i in range(dim)]
32
+ if any(length == 0 for length in lengths):
33
+ return torch.LongTensor(len(examples), *lengths)
34
+
35
+ examples = [F.pad(example, Batch._pad_size(example, lengths)) for example in examples]
36
+ return torch.stack(examples)
37
+
38
+ @staticmethod
39
+ def _pad_size(example, total_size):
40
+ return [p for i, l in enumerate(total_size[::-1]) for p in (0, l - example.size(-1 - i))]
41
+
42
+ @staticmethod
43
+ def index_select(batch, indices):
44
+ filtered_batch = {}
45
+ for key, examples in batch.items():
46
+ if isinstance(examples, list) or isinstance(examples, tuple):
47
+ filtered_batch[key] = [example.index_select(0, indices) for example in examples]
48
+ else:
49
+ filtered_batch[key] = examples.index_select(0, indices)
50
+
51
+ return filtered_batch
52
+
53
+ @staticmethod
54
+ def to_str(batch):
55
+ string = "\n".join([f"\t{name}: {Batch._short_str(item)}" for name, item in batch.items()])
56
+ return string
57
+
58
+ @staticmethod
59
+ def to(batch, device):
60
+ converted = {}
61
+ for field in batch.keys():
62
+ converted[field] = Batch._to(batch[field], device)
63
+ return converted
64
+
65
+ @staticmethod
66
+ def _short_str(tensor):
67
+ # unwrap variable to tensor
68
+ if not torch.is_tensor(tensor):
69
+ # (1) unpack variable
70
+ if hasattr(tensor, "data"):
71
+ tensor = getattr(tensor, "data")
72
+ # (2) handle include_lengths
73
+ elif isinstance(tensor, tuple) or isinstance(tensor, list):
74
+ return str(tuple(Batch._short_str(t) for t in tensor))
75
+ # (3) fallback to default str
76
+ else:
77
+ return str(tensor)
78
+
79
+ # copied from torch _tensor_str
80
+ size_str = "x".join(str(size) for size in tensor.size())
81
+ device_str = "" if not tensor.is_cuda else " (GPU {})".format(tensor.get_device())
82
+ strt = "[{} of size {}{}]".format(torch.typename(tensor), size_str, device_str)
83
+ return strt
84
+
85
+ @staticmethod
86
+ def _to(tensor, device):
87
+ if not torch.is_tensor(tensor):
88
+ if isinstance(tensor, tuple):
89
+ return tuple(Batch._to(t, device) for t in tensor)
90
+ elif isinstance(tensor, list):
91
+ return [Batch._to(t, device) for t in tensor]
92
+ else:
93
+ raise Exception(f"unsupported type of {tensor} to be cast to cuda")
94
+
95
+ return tensor.to(device, non_blocking=True)
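
`Batch.build` transposes a list of per-example dictionaries into per-field batches and right-pads variable-length tensors with zeros. A small illustrative call; the field name `tokens` is made up and not one of the dataset's real fields.

```python
# Illustrative only: pad two variable-length examples into one batch.
import torch
from data.batch import Batch

examples = [
    {"tokens": torch.tensor([1, 2, 3])},
    {"tokens": torch.tensor([4, 5])},
]
batch = Batch.build(examples)
print(batch["tokens"].shape)  # torch.Size([2, 3]); the shorter example is zero-padded
```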
data/dataset.py ADDED
@@ -0,0 +1,245 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import pickle
5
+
6
+ import torch
7
+
8
+ from data.parser.from_mrp.node_centric_parser import NodeCentricParser
9
+ from data.parser.from_mrp.labeled_edge_parser import LabeledEdgeParser
10
+ from data.parser.from_mrp.sequential_parser import SequentialParser
11
+ from data.parser.from_mrp.evaluation_parser import EvaluationParser
12
+ from data.parser.from_mrp.request_parser import RequestParser
13
+ from data.field.edge_field import EdgeField
14
+ from data.field.edge_label_field import EdgeLabelField
15
+ from data.field.field import Field
16
+ from data.field.mini_torchtext.field import Field as TorchTextField
17
+ from data.field.label_field import LabelField
18
+ from data.field.anchored_label_field import AnchoredLabelField
19
+ from data.field.nested_field import NestedField
20
+ from data.field.basic_field import BasicField
21
+ from data.field.bert_field import BertField
22
+ from data.field.anchor_field import AnchorField
23
+ from data.batch import Batch
24
+
25
+
26
+ def char_tokenize(word):
27
+ return [c for i, c in enumerate(word)] # if i < 10 or len(word) - i <= 10]
28
+
29
+
30
+ class Collate:
31
+ def __call__(self, batch):
32
+ batch.sort(key=lambda example: example["every_input"][0].size(0), reverse=True)
33
+ return Batch.build(batch)
34
+
35
+
36
+ class Dataset:
37
+ def __init__(self, args, verbose=True):
38
+ self.verbose = verbose
39
+ self.sos, self.eos, self.pad, self.unk = "<sos>", "<eos>", "<pad>", "<unk>"
40
+
41
+ self.bert_input_field = BertField()
42
+ self.scatter_field = BasicField()
43
+ self.every_word_input_field = Field(lower=True, init_token=self.sos, eos_token=self.eos, batch_first=True, include_lengths=True)
44
+
45
+ char_form_nesting = TorchTextField(tokenize=char_tokenize, init_token=self.sos, eos_token=self.eos, batch_first=True)
46
+ self.char_form_field = NestedField(char_form_nesting, include_lengths=True)
47
+
48
+ self.label_field = LabelField(preprocessing=lambda nodes: [n["label"] for n in nodes])
49
+ self.anchored_label_field = AnchoredLabelField()
50
+
51
+ self.id_field = Field(batch_first=True, tokenize=lambda x: [x])
52
+ self.edge_presence_field = EdgeField()
53
+ self.edge_label_field = EdgeLabelField()
54
+ self.anchor_field = AnchorField()
55
+ self.source_anchor_field = AnchorField()
56
+ self.target_anchor_field = AnchorField()
57
+ self.token_interval_field = BasicField()
58
+
59
+ self.load_dataset(args)
60
+
61
+ def log(self, text):
62
+ if not self.verbose:
63
+ return
64
+ print(text, flush=True)
65
+
66
+ def load_state_dict(self, args, d):
67
+ for key, value in d["vocabs"].items():
68
+ getattr(self, key).vocab = pickle.loads(value)
69
+
70
+ def state_dict(self):
71
+ return {
72
+ "vocabs": {key: pickle.dumps(value.vocab) for key, value in self.__dict__.items() if hasattr(value, "vocab")}
73
+ }
74
+
75
+ def load_sentences(self, sentences, args):
76
+ dataset = RequestParser(
77
+ sentences, args,
78
+ fields={
79
+ "input": [("every_input", self.every_word_input_field), ("char_form_input", self.char_form_field)],
80
+ "bert input": ("input", self.bert_input_field),
81
+ "to scatter": ("input_scatter", self.scatter_field),
82
+ "token anchors": ("token_intervals", self.token_interval_field),
83
+ "id": ("id", self.id_field),
84
+ },
85
+ )
86
+
87
+ self.every_word_input_field.build_vocab(dataset, min_freq=1, specials=[self.pad, self.unk, self.sos, self.eos])
88
+ self.id_field.build_vocab(dataset, min_freq=1, specials=[])
89
+
90
+ return torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=Collate())
91
+
92
+ def load_dataset(self, args):
93
+ parser = {
94
+ "sequential": SequentialParser,
95
+ "node-centric": NodeCentricParser,
96
+ "labeled-edge": LabeledEdgeParser
97
+ }[args.graph_mode]
98
+
99
+ train = parser(
100
+ args, "training",
101
+ fields={
102
+ "input": [("every_input", self.every_word_input_field), ("char_form_input", self.char_form_field)],
103
+ "bert input": ("input", self.bert_input_field),
104
+ "to scatter": ("input_scatter", self.scatter_field),
105
+ "nodes": ("labels", self.label_field),
106
+ "anchored labels": ("anchored_labels", self.anchored_label_field),
107
+ "edge presence": ("edge_presence", self.edge_presence_field),
108
+ "edge labels": ("edge_labels", self.edge_label_field),
109
+ "anchor edges": ("anchor", self.anchor_field),
110
+ "source anchor edges": ("source_anchor", self.source_anchor_field),
111
+ "target anchor edges": ("target_anchor", self.target_anchor_field),
112
+ "token anchors": ("token_intervals", self.token_interval_field),
113
+ "id": ("id", self.id_field),
114
+ },
115
+ filter_pred=lambda example: len(example.input) <= 256,
116
+ )
117
+
118
+ val = parser(
119
+ args, "validation",
120
+ fields={
121
+ "input": [("every_input", self.every_word_input_field), ("char_form_input", self.char_form_field)],
122
+ "bert input": ("input", self.bert_input_field),
123
+ "to scatter": ("input_scatter", self.scatter_field),
124
+ "nodes": ("labels", self.label_field),
125
+ "anchored labels": ("anchored_labels", self.anchored_label_field),
126
+ "edge presence": ("edge_presence", self.edge_presence_field),
127
+ "edge labels": ("edge_labels", self.edge_label_field),
128
+ "anchor edges": ("anchor", self.anchor_field),
129
+ "source anchor edges": ("source_anchor", self.source_anchor_field),
130
+ "target anchor edges": ("target_anchor", self.target_anchor_field),
131
+ "token anchors": ("token_intervals", self.token_interval_field),
132
+ "id": ("id", self.id_field),
133
+ },
134
+ )
135
+
136
+ test = EvaluationParser(
137
+ args,
138
+ fields={
139
+ "input": [("every_input", self.every_word_input_field), ("char_form_input", self.char_form_field)],
140
+ "bert input": ("input", self.bert_input_field),
141
+ "to scatter": ("input_scatter", self.scatter_field),
142
+ "token anchors": ("token_intervals", self.token_interval_field),
143
+ "id": ("id", self.id_field),
144
+ },
145
+ )
146
+
147
+ del train.data, val.data, test.data # TODO: why?
148
+ for f in list(train.fields.values()) + list(val.fields.values()) + list(test.fields.values()): # TODO: why?
149
+ if hasattr(f, "preprocessing"):
150
+ del f.preprocessing
151
+
152
+ self.train_size = len(train)
153
+ self.val_size = len(val)
154
+ self.test_size = len(test)
155
+
156
+ self.log(f"\n{self.train_size} sentences in the train split")
157
+ self.log(f"{self.val_size} sentences in the validation split")
158
+ self.log(f"{self.test_size} sentences in the test split")
159
+
160
+ self.node_count = train.node_counter
161
+ self.token_count = train.input_count
162
+ self.edge_count = train.edge_counter
163
+ self.no_edge_count = train.no_edge_counter
164
+ self.anchor_freq = train.anchor_freq
165
+
166
+ self.source_anchor_freq = train.source_anchor_freq if hasattr(train, "source_anchor_freq") else 0.5
167
+ self.target_anchor_freq = train.target_anchor_freq if hasattr(train, "target_anchor_freq") else 0.5
168
+ self.log(f"{self.node_count} nodes in the train split")
169
+
170
+ self.every_word_input_field.build_vocab(val, test, min_freq=1, specials=[self.pad, self.unk, self.sos, self.eos])
171
+ self.char_form_field.build_vocab(train, min_freq=1, specials=[self.pad, self.unk, self.sos, self.eos])
172
+ self.char_form_field.nesting_field.vocab = self.char_form_field.vocab
173
+ self.id_field.build_vocab(train, val, test, min_freq=1, specials=[])
174
+ self.label_field.build_vocab(train)
175
+ self.anchored_label_field.vocab = self.label_field.vocab
176
+ self.edge_label_field.build_vocab(train)
177
+ print(list(self.edge_label_field.vocab.freqs.keys()), flush=True)
178
+
179
+ self.char_form_vocab_size = len(self.char_form_field.vocab)
180
+ self.create_label_freqs(args)
181
+ self.create_edge_freqs(args)
182
+
183
+ self.log(f"Edge frequency: {self.edge_presence_freq*100:.2f} %")
184
+ self.log(f"{len(self.label_field.vocab)} words in the label vocabulary")
185
+ self.log(f"{len(self.anchored_label_field.vocab)} words in the anchored label vocabulary")
186
+ self.log(f"{len(self.edge_label_field.vocab)} words in the edge label vocabulary")
187
+ self.log(f"{len(self.char_form_field.vocab)} characters in the vocabulary")
188
+
189
+ self.log(self.label_field.vocab.freqs)
190
+ self.log(self.anchored_label_field.vocab.freqs)
191
+
192
+ self.train = torch.utils.data.DataLoader(
193
+ train,
194
+ batch_size=args.batch_size,
195
+ shuffle=True,
196
+ num_workers=args.workers,
197
+ collate_fn=Collate(),
198
+ pin_memory=True,
199
+ drop_last=True
200
+ )
201
+ self.train_size = len(self.train.dataset)
202
+
203
+ self.val = torch.utils.data.DataLoader(
204
+ val,
205
+ batch_size=args.batch_size,
206
+ shuffle=False,
207
+ num_workers=args.workers,
208
+ collate_fn=Collate(),
209
+ pin_memory=True,
210
+ )
211
+ self.val_size = len(self.val.dataset)
212
+
213
+ self.test = torch.utils.data.DataLoader(
214
+ test,
215
+ batch_size=args.batch_size,
216
+ shuffle=False,
217
+ num_workers=args.workers,
218
+ collate_fn=Collate(),
219
+ pin_memory=True,
220
+ )
221
+ self.test_size = len(self.test.dataset)
222
+
223
+ if self.verbose:
224
+ batch = next(iter(self.train))
225
+ print(f"\nBatch content: {Batch.to_str(batch)}\n")
226
+ print(flush=True)
227
+
228
+ def create_label_freqs(self, args):
229
+ n_rules = len(self.label_field.vocab)
230
+ blank_count = (args.query_length * self.token_count - self.node_count)
231
+ label_counts = [blank_count] + [
232
+ self.label_field.vocab.freqs[self.label_field.vocab.itos[i]]
233
+ for i in range(n_rules)
234
+ ]
235
+ label_counts = torch.FloatTensor(label_counts)
236
+ self.label_freqs = label_counts / (self.node_count + blank_count)
237
+ self.log(f"Label frequency: {self.label_freqs}")
238
+
239
+ def create_edge_freqs(self, args):
240
+ edge_counter = [
241
+ self.edge_label_field.vocab.freqs[self.edge_label_field.vocab.itos[i]] for i in range(len(self.edge_label_field.vocab))
242
+ ]
243
+ edge_counter = torch.FloatTensor(edge_counter)
244
+ self.edge_label_freqs = edge_counter / self.edge_count
245
+ self.edge_presence_freq = self.edge_count / (self.edge_count + self.no_edge_count)
data/field/__init__.py ADDED
File without changes
data/field/anchor_field.py ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ from data.field.mini_torchtext.field import RawField
6
+
7
+
8
+ class AnchorField(RawField):
9
+ def process(self, batch, device=None):
10
+ tensors, masks = self.pad(batch, device)
11
+ return tensors, masks
12
+
13
+ def pad(self, anchors, device):
14
+ tensor = torch.zeros(anchors[0], anchors[1], dtype=torch.long, device=device)
15
+ for anchor in anchors[-1]:
16
+ tensor[anchor[0], anchor[1]] = 1
17
+ mask = tensor.sum(-1) == 0
18
+
19
+ return tensor, mask
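
`AnchorField.process` expects `(n_nodes, n_tokens, [(node_index, token_index), ...])` and returns a 0/1 anchoring matrix plus a mask of nodes without any anchor. A minimal illustrative call with arbitrary sizes:

```python
# Sketch: two nodes anchored over four tokens.
from data.field.anchor_field import AnchorField

field = AnchorField()
tensor, mask = field.process((2, 4, [(0, 1), (1, 2), (1, 3)]))
print(tensor)  # 2x4 matrix with ones at the listed (node, token) positions
print(mask)    # True only for nodes whose row is all zeros
```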
data/field/anchored_label_field.py ADDED
@@ -0,0 +1,38 @@
1
+ import torch
2
+ from data.field.mini_torchtext.field import RawField
3
+
4
+
5
+ class AnchoredLabelField(RawField):
6
+ def __init__(self):
7
+ super(AnchoredLabelField, self).__init__()
8
+ self.vocab = None
9
+
10
+ def process(self, example, device=None):
11
+ example = self.numericalize(example)
12
+ tensor = self.pad(example, device)
13
+ return tensor
14
+
15
+ def pad(self, example, device):
16
+ n_labels = len(self.vocab)
17
+ n_nodes, n_tokens = len(example[1]), example[0]
18
+
19
+ tensor = torch.full([n_nodes, n_tokens, n_labels + 1], 0, dtype=torch.long, device=device)
20
+ for i_node, node in enumerate(example[1]):
21
+ for anchor, rule in node:
22
+ tensor[i_node, anchor, rule + 1] = 1
23
+
24
+ return tensor
25
+
26
+ def numericalize(self, arr):
27
+ def multi_map(array, function):
28
+ if isinstance(array, tuple):
29
+ return (array[0], function(array[1]))
30
+ elif isinstance(array, list):
31
+ return [multi_map(a, function) for a in array]
32
+ else:
33
+ return array
34
+
35
+ if self.vocab is not None:
36
+ arr = multi_map(arr, lambda x: self.vocab.stoi[x] if x in self.vocab.stoi else 0)
37
+
38
+ return arr
data/field/basic_field.py ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ from data.field.mini_torchtext.field import RawField
6
+
7
+
8
+ class BasicField(RawField):
9
+ def process(self, example, device=None):
10
+ tensor = torch.tensor(example, dtype=torch.long, device=device)
11
+ return tensor
data/field/bert_field.py ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ from data.field.mini_torchtext.field import RawField
6
+
7
+
8
+ class BertField(RawField):
9
+ def __init__(self):
10
+ super(BertField, self).__init__()
11
+
12
+ def process(self, example, device=None):
13
+ attention_mask = [1] * len(example)
14
+
15
+ example = torch.LongTensor(example, device=device)
16
+ attention_mask = torch.ones_like(example)
17
+
18
+ return example, attention_mask
data/field/edge_field.py ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ from data.field.mini_torchtext.field import RawField
6
+ from data.field.mini_torchtext.vocab import Vocab
7
+ from collections import Counter
8
+ import types
9
+
10
+
11
+ class EdgeField(RawField):
12
+ def __init__(self):
13
+ super(EdgeField, self).__init__()
14
+ self.vocab = None
15
+
16
+ def process(self, edges, device=None):
17
+ edges = self.numericalize(edges)
18
+ tensor = self.pad(edges, device)
19
+ return tensor
20
+
21
+ def pad(self, edges, device):
22
+ tensor = torch.zeros(edges[0], edges[1], dtype=torch.long, device=device)
23
+ for edge in edges[-1]:
24
+ tensor[edge[0], edge[1]] = edge[2]
25
+
26
+ return tensor
27
+
28
+ def numericalize(self, arr):
29
+ def multi_map(array, function):
30
+ if isinstance(array, tuple):
31
+ return (array[0], array[1], function(array[2]))
32
+ elif isinstance(array, list):
33
+ return [multi_map(array[i], function) for i in range(len(array))]
34
+ else:
35
+ return array
36
+
37
+ if self.vocab is not None:
38
+ arr = multi_map(arr, lambda x: self.vocab.stoi[x] if x is not None else 0)
39
+ return arr
40
+
41
+ def build_vocab(self, *args):
42
+ def generate(l):
43
+ if isinstance(l, tuple):
44
+ yield l[2]
45
+ elif isinstance(l, list) or isinstance(l, types.GeneratorType):
46
+ for i in l:
47
+ yield from generate(i)
48
+ else:
49
+ return
50
+
51
+ counter = Counter()
52
+ sources = []
53
+ for arg in args:
54
+ if isinstance(arg, torch.utils.data.Dataset):
55
+ sources += [arg.get_examples(name) for name, field in arg.fields.items() if field is self]
56
+ else:
57
+ sources.append(arg)
58
+
59
+ for x in generate(sources):
60
+ if x is not None:
61
+ counter.update([x])
62
+
63
+ self.vocab = Vocab(counter, specials=[])
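
`EdgeField` builds its vocabulary from `(source, target, label)` triples and then writes the numericalized labels into a dense `n x n` matrix. The sketch below is illustrative only and assumes the example is passed as a list `[n, n, edges]`, which is the shape `numericalize`'s recursion expects; the labels and sizes are made up.

```python
# Sketch: build a tiny edge vocabulary and numericalize one example.
from data.field.edge_field import EdgeField

field = EdgeField()
field.build_vocab([3, 3, [(0, 1, "Target"), (1, 2, "Source")]])
matrix = field.process([3, 3, [(0, 1, "Target")]])
print(matrix)  # 3x3 LongTensor; cell (0, 1) holds the vocabulary index assigned to "Target"
```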
data/field/edge_label_field.py ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ from data.field.mini_torchtext.field import RawField
6
+ from data.field.mini_torchtext.vocab import Vocab
7
+ from collections import Counter
8
+ import types
9
+
10
+
11
+ class EdgeLabelField(RawField):
12
+ def process(self, edges, device=None):
13
+ edges, masks = self.numericalize(edges)
14
+ edges, masks = self.pad(edges, masks, device)
15
+
16
+ return edges, masks
17
+
18
+ def pad(self, edges, masks, device):
19
+ n_labels = len(self.vocab)
20
+
21
+ tensor = torch.zeros(edges[0], edges[1], n_labels, dtype=torch.long, device=device)
22
+ mask_tensor = torch.zeros(edges[0], edges[1], dtype=torch.bool, device=device)
23
+
24
+ for edge in edges[-1]:
25
+ tensor[edge[0], edge[1], edge[2]] = 1
26
+
27
+ for mask in masks[-1]:
28
+ mask_tensor[mask[0], mask[1]] = mask[2]
29
+
30
+ return tensor, mask_tensor
31
+
32
+ def numericalize(self, arr):
33
+ def multi_map(array, function):
34
+ if isinstance(array, tuple):
35
+ return (array[0], array[1], function(array[2]))
36
+ elif isinstance(array, list):
37
+ return [multi_map(array[i], function) for i in range(len(array))]
38
+ else:
39
+ return array
40
+
41
+ mask = multi_map(arr, lambda x: x is None)
42
+ arr = multi_map(arr, lambda x: self.vocab.stoi[x] if x in self.vocab.stoi else 0)
43
+ return arr, mask
44
+
45
+ def build_vocab(self, *args):
46
+ def generate(l):
47
+ if isinstance(l, tuple):
48
+ yield l[2]
49
+ elif isinstance(l, list) or isinstance(l, types.GeneratorType):
50
+ for i in l:
51
+ yield from generate(i)
52
+ else:
53
+ return
54
+
55
+ counter = Counter()
56
+ sources = []
57
+ for arg in args:
58
+ if isinstance(arg, torch.utils.data.Dataset):
59
+ sources += [arg.get_examples(name) for name, field in arg.fields.items() if field is self]
60
+ else:
61
+ sources.append(arg)
62
+
63
+ for x in generate(sources):
64
+ if x is not None:
65
+ counter.update([x])
66
+
67
+ self.vocab = Vocab(counter, specials=[])
data/field/field.py ADDED
@@ -0,0 +1,70 @@
1
+ import torch
2
+ from data.field.mini_torchtext.field import Field as TorchTextField
3
+ from collections import Counter, OrderedDict
4
+
5
+
6
+ # small change of vocab building to correspond to our version of Dataset
7
+ class Field(TorchTextField):
8
+ def build_vocab(self, *args, **kwargs):
9
+ counter = Counter()
10
+ sources = []
11
+ for arg in args:
12
+ if isinstance(arg, torch.utils.data.Dataset):
13
+ sources += [arg.get_examples(name) for name, field in arg.fields.items() if field is self]
14
+ else:
15
+ sources.append(arg)
16
+ for data in sources:
17
+ for x in data:
18
+ if not self.sequential:
19
+ x = [x]
20
+ counter.update(x)
21
+
22
+ specials = list(
23
+ OrderedDict.fromkeys(
24
+ tok
25
+ for tok in [self.unk_token, self.pad_token, self.init_token, self.eos_token] + kwargs.pop("specials", [])
26
+ if tok is not None
27
+ )
28
+ )
29
+ self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
30
+
31
+ def process(self, example, device=None):
32
+ if self.include_lengths:
33
+ example = example, len(example)
34
+ tensor = self.numericalize(example, device=device)
35
+ return tensor
36
+
37
+ def numericalize(self, ex, device=None):
38
+ if self.include_lengths and not isinstance(ex, tuple):
39
+ raise ValueError("Field has include_lengths set to True, but input data is not a tuple of (data batch, batch lengths).")
40
+
41
+ if isinstance(ex, tuple):
42
+ ex, lengths = ex
43
+ lengths = torch.tensor(lengths, dtype=self.dtype, device=device)
44
+
45
+ if self.use_vocab:
46
+ if self.sequential:
47
+ ex = [self.vocab.stoi[x] for x in ex]
48
+ else:
49
+ ex = self.vocab.stoi[ex]
50
+
51
+ if self.postprocessing is not None:
52
+ ex = self.postprocessing(ex, self.vocab)
53
+ else:
54
+ numericalization_func = self.dtypes[self.dtype]
55
+
56
+ if not self.sequential:
57
+ ex = numericalization_func(ex) if isinstance(ex, str) else ex
58
+ if self.postprocessing is not None:
59
+ ex = self.postprocessing(ex, None)
60
+
61
+ var = torch.tensor(ex, dtype=self.dtype, device=device)
62
+
63
+ if self.sequential and not self.batch_first:
64
+ var.t_()
65
+ if self.sequential:
66
+ var = var.contiguous()
67
+
68
+ if self.include_lengths:
69
+ return var, lengths
70
+ return var
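
Unlike the upstream torchtext `Field`, this subclass processes one example at a time and can build its vocabulary from plain token lists as well as from the custom `Dataset`. A minimal illustrative use with arbitrary tokens:

```python
# Sketch: build a vocabulary from raw token lists and numericalize one example.
from data.field.field import Field

field = Field(lower=True, batch_first=True, include_lengths=True)
field.build_vocab([["vi", "liker", "svart", "kaffe"]])    # non-Dataset sources are used as-is
tensor, length = field.process(["vi", "liker", "kaffe"])  # a single pre-tokenized example
print(tensor, length)  # LongTensor of vocabulary indices and its length (3)
```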
data/field/label_field.py ADDED
@@ -0,0 +1,36 @@
1
+ import torch
2
+ from data.field.mini_torchtext.field import RawField
3
+ from data.field.mini_torchtext.vocab import Vocab
4
+ from collections import Counter
5
+
6
+
7
+ class LabelField(RawField):
8
+ def __init__(self, preprocessing=None):
9
+ super(LabelField, self).__init__(preprocessing=preprocessing)
10
+ self.vocab = None
11
+
12
+ def build_vocab(self, *args, **kwargs):
13
+ sources = []
14
+ for arg in args:
15
+ if isinstance(arg, torch.utils.data.Dataset):
16
+ sources += [arg.get_examples(name) for name, field in arg.fields.items() if field is self]
17
+ else:
18
+ sources.append(arg)
19
+
20
+ counter = Counter()
21
+ for data in sources:
22
+ for x in data:
23
+ counter.update(x)
24
+
25
+ self.vocab = Vocab(counter, specials=[])
26
+
27
+ def process(self, example, device=None):
28
+ tensor, lengths = self.numericalize(example, device=device)
29
+ return tensor, lengths
30
+
31
+ def numericalize(self, example, device=None):
32
+ example = [self.vocab.stoi[x] + 1 for x in example]
33
+ length = torch.LongTensor([len(example)], device=device).squeeze(0)
34
+ tensor = torch.LongTensor(example, device=device)
35
+
36
+ return tensor, length
data/field/mini_torchtext/example.py ADDED
@@ -0,0 +1,100 @@
1
+ import six
2
+ import json
3
+ from functools import reduce
4
+
5
+
6
+ class Example(object):
7
+ """Defines a single training or test example.
8
+
9
+ Stores each column of the example as an attribute.
10
+ """
11
+ @classmethod
12
+ def fromJSON(cls, data, fields):
13
+ ex = cls()
14
+ obj = json.loads(data)
15
+
16
+ for key, vals in fields.items():
17
+ if vals is not None:
18
+ if not isinstance(vals, list):
19
+ vals = [vals]
20
+
21
+ for val in vals:
22
+ # for processing the key likes 'foo.bar'
23
+ name, field = val
24
+ ks = key.split('.')
25
+
26
+ def reducer(obj, key):
27
+ if isinstance(obj, list):
28
+ results = []
29
+ for data in obj:
30
+ if key not in data:
31
+ # key error
32
+ raise ValueError("Specified key {} was not found in "
33
+ "the input data".format(key))
34
+ else:
35
+ results.append(data[key])
36
+ return results
37
+ else:
38
+ # key error
39
+ if key not in obj:
40
+ raise ValueError("Specified key {} was not found in "
41
+ "the input data".format(key))
42
+ else:
43
+ return obj[key]
44
+
45
+ v = reduce(reducer, ks, obj)
46
+ setattr(ex, name, field.preprocess(v))
47
+ return ex
48
+
49
+ @classmethod
50
+ def fromdict(cls, data, fields):
51
+ ex = cls()
52
+ for key, vals in fields.items():
53
+ if key not in data:
54
+ raise ValueError("Specified key {} was not found in "
55
+ "the input data".format(key))
56
+ if vals is not None:
57
+ if not isinstance(vals, list):
58
+ vals = [vals]
59
+ for val in vals:
60
+ name, field = val
61
+ setattr(ex, name, field.preprocess(data[key]))
62
+ return ex
63
+
64
+ @classmethod
65
+ def fromCSV(cls, data, fields, field_to_index=None):
66
+ if field_to_index is None:
67
+ return cls.fromlist(data, fields)
68
+ else:
69
+ assert(isinstance(fields, dict))
70
+ data_dict = {f: data[idx] for f, idx in field_to_index.items()}
71
+ return cls.fromdict(data_dict, fields)
72
+
73
+ @classmethod
74
+ def fromlist(cls, data, fields):
75
+ ex = cls()
76
+ for (name, field), val in zip(fields, data):
77
+ if field is not None:
78
+ if isinstance(val, six.string_types):
79
+ val = val.rstrip('\n')
80
+ # Handle field tuples
81
+ if isinstance(name, tuple):
82
+ for n, f in zip(name, field):
83
+ setattr(ex, n, f.preprocess(val))
84
+ else:
85
+ setattr(ex, name, field.preprocess(val))
86
+ return ex
87
+
88
+ @classmethod
89
+ def fromtree(cls, data, fields, subtrees=False):
90
+ try:
91
+ from nltk.tree import Tree
92
+ except ImportError:
93
+ print("Please install NLTK. "
94
+ "See the docs at http://nltk.org for more information.")
95
+ raise
96
+ tree = Tree.fromstring(data)
97
+ if subtrees:
98
+ return [cls.fromlist(
99
+ [' '.join(t.leaves()), t.label()], fields) for t in tree.subtrees()]
100
+ return cls.fromlist([' '.join(tree.leaves()), tree.label()], fields)
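
The `from_mrp` parsers presumably build their examples through this class: the `fields` dicts constructed in `data/dataset.py` follow `fromdict`'s `{raw key: (attribute name, field)}` format. A minimal illustrative call; the key and field here are placeholders, not the parsers' real field configuration.

```python
# Sketch: construct a single Example from a dict using a RawField.
from data.field.mini_torchtext.example import Example
from data.field.mini_torchtext.field import RawField

fields = {"text": ("text", RawField())}  # raw-data key -> (attribute name, field)
ex = Example.fromdict({"text": "vi liker svart kaffe"}, fields)
print(ex.text)  # RawField.preprocess() is the identity here, so the raw string is stored
```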
data/field/mini_torchtext/field.py ADDED
@@ -0,0 +1,637 @@
1
+ # coding: utf8
2
+ from collections import Counter, OrderedDict
3
+ from itertools import chain
4
+ import six
5
+ import torch
6
+
7
+ from .pipeline import Pipeline
8
+ from .utils import get_tokenizer, dtype_to_attr, is_tokenizer_serializable
9
+ from .vocab import Vocab
10
+
11
+
12
+ class RawField(object):
13
+ """ Defines a general datatype.
14
+
15
+ Every dataset consists of one or more types of data. For instance, a text
16
+ classification dataset contains sentences and their classes, while a
17
+ machine translation dataset contains paired examples of text in two
18
+ languages. Each of these types of data is represented by a RawField object.
19
+ A RawField object does not assume any property of the data type and
20
+ it holds parameters relating to how a datatype should be processed.
21
+
22
+ Attributes:
23
+ preprocessing: The Pipeline that will be applied to examples
24
+ using this field before creating an example.
25
+ Default: None.
26
+ postprocessing: A Pipeline that will be applied to a list of examples
27
+ using this field before assigning to a batch.
28
+ Function signature: (batch(list)) -> object
29
+ Default: None.
30
+ is_target: Whether this field is a target variable.
31
+ Affects iteration over batches. Default: False
32
+ """
33
+
34
+ def __init__(self, preprocessing=None, postprocessing=None, is_target=False):
35
+ self.preprocessing = preprocessing
36
+ self.postprocessing = postprocessing
37
+ self.is_target = is_target
38
+
39
+ def preprocess(self, x):
40
+ """ Preprocess an example if the `preprocessing` Pipeline is provided. """
41
+ if hasattr(self, "preprocessing") and self.preprocessing is not None:
42
+ return self.preprocessing(x)
43
+ else:
44
+ return x
45
+
46
+ def process(self, batch, *args, **kwargs):
47
+ """ Process a list of examples to create a batch.
48
+
49
+ Postprocess the batch with user-provided Pipeline.
50
+
51
+ Args:
52
+ batch (list(object)): A list of object from a batch of examples.
53
+ Returns:
54
+ object: Processed object given the input and custom
55
+ postprocessing Pipeline.
56
+ """
57
+ if self.postprocessing is not None:
58
+ batch = self.postprocessing(batch)
59
+ return batch
60
+
61
+
62
+ class Field(RawField):
63
+ """Defines a datatype together with instructions for converting to Tensor.
64
+
65
+ Field class models common text processing datatypes that can be represented
66
+ by tensors. It holds a Vocab object that defines the set of possible values
67
+ for elements of the field and their corresponding numerical representations.
68
+ The Field object also holds other parameters relating to how a datatype
69
+ should be numericalized, such as a tokenization method and the kind of
70
+ Tensor that should be produced.
71
+
72
+ If a Field is shared between two columns in a dataset (e.g., question and
73
+ answer in a QA dataset), then they will have a shared vocabulary.
74
+
75
+ Attributes:
76
+ sequential: Whether the datatype represents sequential data. If False,
77
+ no tokenization is applied. Default: True.
78
+ use_vocab: Whether to use a Vocab object. If False, the data in this
79
+ field should already be numerical. Default: True.
80
+ init_token: A token that will be prepended to every example using this
81
+ field, or None for no initial token. Default: None.
82
+ eos_token: A token that will be appended to every example using this
83
+ field, or None for no end-of-sentence token. Default: None.
84
+ fix_length: A fixed length that all examples using this field will be
85
+ padded to, or None for flexible sequence lengths. Default: None.
86
+ dtype: The torch.dtype class that represents a batch of examples
87
+ of this kind of data. Default: torch.long.
88
+ preprocessing: The Pipeline that will be applied to examples
89
+ using this field after tokenizing but before numericalizing. Many
90
+ Datasets replace this attribute with a custom preprocessor.
91
+ Default: None.
92
+ postprocessing: A Pipeline that will be applied to examples using
93
+ this field after numericalizing but before the numbers are turned
94
+ into a Tensor. The pipeline function takes the batch as a list, and
95
+ the field's Vocab.
96
+ Default: None.
97
+ lower: Whether to lowercase the text in this field. Default: False.
98
+ tokenize: The function used to tokenize strings using this field into
99
+ sequential examples. If "spacy", the SpaCy tokenizer is
100
+ used. If a non-serializable function is passed as an argument,
101
+ the field will not be able to be serialized. Default: string.split.
102
+ tokenizer_language: The language of the tokenizer to be constructed.
103
+ Various languages currently supported only in SpaCy.
104
+ include_lengths: Whether to return a tuple of a padded minibatch and
105
+ a list containing the lengths of each examples, or just a padded
106
+ minibatch. Default: False.
107
+ batch_first: Whether to produce tensors with the batch dimension first.
108
+ Default: False.
109
+ pad_token: The string token used as padding. Default: "<pad>".
110
+ unk_token: The string token used to represent OOV words. Default: "<unk>".
111
+ pad_first: Do the padding of the sequence at the beginning. Default: False.
112
+ truncate_first: Do the truncating of the sequence at the beginning. Default: False
113
+ stop_words: Tokens to discard during the preprocessing step. Default: None
114
+ is_target: Whether this field is a target variable.
115
+ Affects iteration over batches. Default: False
116
+ """
117
+
118
+ vocab_cls = Vocab
119
+ # Dictionary mapping PyTorch tensor dtypes to the appropriate Python
120
+ # numeric type.
121
+ dtypes = {
122
+ torch.float32: float,
123
+ torch.float: float,
124
+ torch.float64: float,
125
+ torch.double: float,
126
+ torch.float16: float,
127
+ torch.half: float,
128
+
129
+ torch.uint8: int,
130
+ torch.int8: int,
131
+ torch.int16: int,
132
+ torch.short: int,
133
+ torch.int32: int,
134
+ torch.int: int,
135
+ torch.int64: int,
136
+ torch.long: int,
137
+ }
138
+
139
+ ignore = ['dtype', 'tokenize']
140
+
141
+ def __init__(self, sequential=True, use_vocab=True, init_token=None,
142
+ eos_token=None, fix_length=None, dtype=torch.long,
143
+ preprocessing=None, postprocessing=None, lower=False,
144
+ tokenize=None, tokenizer_language='en', include_lengths=False,
145
+ batch_first=False, pad_token="<pad>", unk_token="<unk>",
146
+ pad_first=False, truncate_first=False, stop_words=None,
147
+ is_target=False):
148
+ self.sequential = sequential
149
+ self.use_vocab = use_vocab
150
+ self.init_token = init_token
151
+ self.eos_token = eos_token
152
+ self.unk_token = unk_token
153
+ self.fix_length = fix_length
154
+ self.dtype = dtype
155
+ self.preprocessing = preprocessing
156
+ self.postprocessing = postprocessing
157
+ self.lower = lower
158
+ # store params to construct tokenizer for serialization
159
+ # in case the tokenizer isn't picklable (e.g. spacy)
160
+ self.tokenizer_args = (tokenize, tokenizer_language)
161
+ self.tokenize = get_tokenizer(tokenize, tokenizer_language)
162
+ self.include_lengths = include_lengths
163
+ self.batch_first = batch_first
164
+ self.pad_token = pad_token if self.sequential else None
165
+ self.pad_first = pad_first
166
+ self.truncate_first = truncate_first
167
+ try:
168
+ self.stop_words = set(stop_words) if stop_words is not None else None
169
+ except TypeError:
170
+ raise ValueError("Stop words must be convertible to a set")
171
+ self.is_target = is_target
172
+
173
+ def __getstate__(self):
174
+ str_type = dtype_to_attr(self.dtype)
175
+ if is_tokenizer_serializable(*self.tokenizer_args):
176
+ tokenize = self.tokenize
177
+ else:
178
+ # signal to restore in `__setstate__`
179
+ tokenize = None
180
+ attrs = {k: v for k, v in self.__dict__.items() if k not in self.ignore}
181
+ attrs['dtype'] = str_type
182
+ attrs['tokenize'] = tokenize
183
+
184
+ return attrs
185
+
186
+ def __setstate__(self, state):
187
+ state['dtype'] = getattr(torch, state['dtype'])
188
+ if not state['tokenize']:
189
+ state['tokenize'] = get_tokenizer(*state['tokenizer_args'])
190
+ self.__dict__.update(state)
191
+
192
+ def __hash__(self):
193
+ # we don't expect this to be called often
194
+ return 42
195
+
196
+ def __eq__(self, other):
197
+ if not isinstance(other, RawField):
198
+ return False
199
+
200
+ return self.__dict__ == other.__dict__
201
+
202
+ def preprocess(self, x):
203
+ """Load a single example using this field, tokenizing if necessary.
204
+
205
+ If the input is a Python 2 `str`, it will be converted to Unicode
206
+ first. If `sequential=True`, it will be tokenized. Then the input
207
+ will be optionally lowercased and passed to the user-provided
208
+ `preprocessing` Pipeline."""
209
+ if (six.PY2 and isinstance(x, six.string_types)
210
+ and not isinstance(x, six.text_type)):
211
+ x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
212
+ if self.sequential and isinstance(x, six.text_type):
213
+ x = self.tokenize(x.rstrip('\n'))
214
+ if self.lower:
215
+ x = Pipeline(six.text_type.lower)(x)
216
+ if self.sequential and self.use_vocab and self.stop_words is not None:
217
+ x = [w for w in x if w not in self.stop_words]
218
+ if hasattr(self, "preprocessing") and self.preprocessing is not None:
219
+ return self.preprocessing(x)
220
+ else:
221
+ return x
222
+
223
+ def process(self, batch, device=None):
224
+ """ Process a list of examples to create a torch.Tensor.
225
+
226
+ Pad, numericalize, and postprocess a batch and create a tensor.
227
+
228
+ Args:
229
+ batch (list(object)): A list of object from a batch of examples.
230
+ Returns:
231
+ torch.autograd.Variable: Processed object given the input
232
+ and custom postprocessing Pipeline.
233
+ """
234
+ padded = self.pad(batch)
235
+ tensor = self.numericalize(padded, device=device)
236
+ return tensor
237
+
238
+ def pad(self, minibatch):
239
+ """Pad a batch of examples using this field.
240
+
241
+ Pads to self.fix_length if provided, otherwise pads to the length of
242
+ the longest example in the batch. Prepends self.init_token and appends
243
+ self.eos_token if those attributes are not None. Returns a tuple of the
244
+ padded list and a list containing lengths of each example if
245
+ `self.include_lengths` is `True` and `self.sequential` is `True`, else just
246
+ returns the padded list. If `self.sequential` is `False`, no padding is applied.
247
+ """
248
+ minibatch = list(minibatch)
249
+ if not self.sequential:
250
+ return minibatch
251
+ if self.fix_length is None:
252
+ max_len = max(len(x) for x in minibatch)
253
+ else:
254
+ max_len = self.fix_length + (
255
+ self.init_token, self.eos_token).count(None) - 2
256
+ padded, lengths = [], []
257
+ for x in minibatch:
258
+ if self.pad_first:
259
+ padded.append(
260
+ [self.pad_token] * max(0, max_len - len(x))
261
+ + ([] if self.init_token is None else [self.init_token])
262
+ + list(x[-max_len:] if self.truncate_first else x[:max_len])
263
+ + ([] if self.eos_token is None else [self.eos_token]))
264
+ else:
265
+ padded.append(
266
+ ([] if self.init_token is None else [self.init_token])
267
+ + list(x[-max_len:] if self.truncate_first else x[:max_len])
268
+ + ([] if self.eos_token is None else [self.eos_token])
269
+ + [self.pad_token] * max(0, max_len - len(x)))
270
+ lengths.append(len(padded[-1]) - max(0, max_len - len(x)))
271
+ if self.include_lengths:
272
+ return (padded, lengths)
273
+ return padded
274
+
275
+ def build_vocab(self, *args, **kwargs):
276
+ """Construct the Vocab object for this field from one or more datasets.
277
+
278
+ Arguments:
279
+ Positional arguments: Dataset objects or other iterable data
280
+ sources from which to construct the Vocab object that
281
+ represents the set of possible values for this field. If
282
+ a Dataset object is provided, all columns corresponding
283
+ to this field are used; individual columns can also be
284
+ provided directly.
285
+ Remaining keyword arguments: Passed to the constructor of Vocab.
286
+ """
287
+ counter = Counter()
288
+ sources = []
289
+ for arg in args:
290
+ sources.append(arg)
291
+ for data in sources:
292
+ for x in data:
293
+ if not self.sequential:
294
+ x = [x]
295
+ try:
296
+ counter.update(x)
297
+ except TypeError:
298
+ counter.update(chain.from_iterable(x))
299
+ specials = list(OrderedDict.fromkeys(
300
+ tok for tok in [self.unk_token, self.pad_token, self.init_token,
301
+ self.eos_token] + kwargs.pop('specials', [])
302
+ if tok is not None))
303
+ self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
304
+
305
+ def numericalize(self, arr, device=None):
306
+ """Turn a batch of examples that use this field into a Variable.
307
+
308
+ If the field has include_lengths=True, a tensor of lengths will be
309
+ included in the return value.
310
+
311
+ Arguments:
312
+ arr (List[List[str]], or tuple of (List[List[str]], List[int])):
313
+ List of tokenized and padded examples, or tuple of List of
314
+ tokenized and padded examples and List of lengths of each
315
+ example if self.include_lengths is True.
316
+ device (str or torch.device): A string or instance of `torch.device`
317
+ specifying which device the Variables are going to be created on.
318
+ If left as default, the tensors will be created on cpu. Default: None.
319
+ """
320
+ if self.include_lengths and not isinstance(arr, tuple):
321
+ raise ValueError("Field has include_lengths set to True, but "
322
+ "input data is not a tuple of "
323
+ "(data batch, batch lengths).")
324
+ if isinstance(arr, tuple):
325
+ arr, lengths = arr
326
+ lengths = torch.tensor(lengths, dtype=self.dtype, device=device)
327
+
328
+ if self.use_vocab:
329
+ if self.sequential:
330
+ arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
331
+ else:
332
+ arr = [self.vocab.stoi[x] for x in arr]
333
+
334
+ if self.postprocessing is not None:
335
+ arr = self.postprocessing(arr, self.vocab)
336
+ else:
337
+ if self.dtype not in self.dtypes:
338
+ raise ValueError(
339
+ "Specified Field dtype {} can not be used with "
340
+ "use_vocab=False because we do not know how to numericalize it. "
341
+ "Please raise an issue at "
342
+ "https://github.com/pytorch/text/issues".format(self.dtype))
343
+ numericalization_func = self.dtypes[self.dtype]
344
+ # It doesn't make sense to explicitly coerce to a numeric type if
345
+ # the data is sequential, since it's unclear how to coerce padding tokens
346
+ # to a numeric type.
347
+ if not self.sequential:
348
+ arr = [numericalization_func(x) if isinstance(x, six.string_types)
349
+ else x for x in arr]
350
+ if self.postprocessing is not None:
351
+ arr = self.postprocessing(arr, None)
352
+
353
+ var = torch.tensor(arr, dtype=self.dtype, device=device)
354
+
355
+ if self.sequential and not self.batch_first:
356
+ var.t_()
357
+ if self.sequential:
358
+ var = var.contiguous()
359
+
360
+ if self.include_lengths:
361
+ return var, lengths
362
+ return var
363
+
364
+
365
+ class NestedField(Field):
366
+ """A nested field.
367
+
368
+ A nested field holds another field (called *nesting field*), accepts an untokenized
369
+ string or a list of string tokens, and groups and treats them as one field as described
370
+ by the nesting field. Every token will be preprocessed, padded, etc. in the manner
371
+ specified by the nesting field. Note that this means a nested field always has
372
+ ``sequential=True``. The two fields' vocabularies will be shared. Their
373
+ numericalization results will be stacked into a single tensor. And NestedField will
374
+ share the same include_lengths with nesting_field, so one shouldn't specify the
375
+ include_lengths in the nesting_field. This field is
376
+ primarily used to implement character embeddings. See ``tests/data/test_field.py``
377
+ for examples on how to use this field.
378
+
379
+ Arguments:
380
+ nesting_field (Field): A field contained in this nested field.
381
+ use_vocab (bool): Whether to use a Vocab object. If False, the data in this
382
+ field should already be numerical. Default: ``True``.
383
+ init_token (str): A token that will be prepended to every example using this
384
+ field, or None for no initial token. Default: ``None``.
385
+ eos_token (str): A token that will be appended to every example using this
386
+ field, or None for no end-of-sentence token. Default: ``None``.
387
+ fix_length (int): A fixed length that all examples using this field will be
388
+ padded to, or ``None`` for flexible sequence lengths. Default: ``None``.
389
+ dtype: The torch.dtype class that represents a batch of examples
390
+ of this kind of data. Default: ``torch.long``.
391
+ preprocessing (Pipeline): The Pipeline that will be applied to examples
392
+ using this field after tokenizing but before numericalizing. Many
393
+ Datasets replace this attribute with a custom preprocessor.
394
+ Default: ``None``.
395
+ postprocessing (Pipeline): A Pipeline that will be applied to examples using
396
+ this field after numericalizing but before the numbers are turned
397
+ into a Tensor. The pipeline function takes the batch as a list, and
398
+ the field's Vocab. Default: ``None``.
399
+ include_lengths: Whether to return a tuple of a padded minibatch and
400
+ a list containing the lengths of each examples, or just a padded
401
+ minibatch. Default: False.
402
+ tokenize: The function used to tokenize strings using this field into
403
+ sequential examples. If "spacy", the SpaCy tokenizer is
404
+ used. If a non-serializable function is passed as an argument,
405
+ the field will not be able to be serialized. Default: string.split.
406
+ tokenizer_language: The language of the tokenizer to be constructed.
407
+ Various languages currently supported only in SpaCy.
408
+ pad_token (str): The string token used as padding. If ``nesting_field`` is
409
+ sequential, this will be set to its ``pad_token``. Default: ``"<pad>"``.
410
+ pad_first (bool): Do the padding of the sequence at the beginning. Default:
411
+ ``False``.
412
+ """
413
+
414
+ def __init__(self, nesting_field, use_vocab=True, init_token=None, eos_token=None,
415
+ fix_length=None, dtype=torch.long, preprocessing=None,
416
+ postprocessing=None, tokenize=None, tokenizer_language='en',
417
+ include_lengths=False, pad_token='<pad>',
418
+ pad_first=False, truncate_first=False):
419
+ if isinstance(nesting_field, NestedField):
420
+ raise ValueError('nesting field must not be another NestedField')
421
+ if nesting_field.include_lengths:
422
+ raise ValueError('nesting field cannot have include_lengths=True')
423
+
424
+ if nesting_field.sequential:
425
+ pad_token = nesting_field.pad_token
426
+ super(NestedField, self).__init__(
427
+ use_vocab=use_vocab,
428
+ init_token=init_token,
429
+ eos_token=eos_token,
430
+ fix_length=fix_length,
431
+ dtype=dtype,
432
+ preprocessing=preprocessing,
433
+ postprocessing=postprocessing,
434
+ lower=nesting_field.lower,
435
+ tokenize=tokenize,
436
+ tokenizer_language=tokenizer_language,
437
+ batch_first=True,
438
+ pad_token=pad_token,
439
+ unk_token=nesting_field.unk_token,
440
+ pad_first=pad_first,
441
+ truncate_first=truncate_first,
442
+ include_lengths=include_lengths
443
+ )
444
+ self.nesting_field = nesting_field
445
+ # in case the user forgets to do that
446
+ self.nesting_field.batch_first = True
447
+
448
+ def preprocess(self, xs):
449
+ """Preprocess a single example.
450
+
451
+ First, tokenization and the supplied preprocessing pipeline are applied. Since
452
+ this field is always sequential, the result is a list. Then, each element of
453
+ the list is preprocessed using ``self.nesting_field.preprocess`` and the resulting
454
+ list is returned.
455
+
456
+ Arguments:
457
+ xs (list or str): The input to preprocess.
458
+
459
+ Returns:
460
+ list: The preprocessed list.
461
+ """
462
+ return [self.nesting_field.preprocess(x)
463
+ for x in super(NestedField, self).preprocess(xs)]
464
+
465
+ def pad(self, minibatch):
466
+ """Pad a batch of examples using this field.
467
+
468
+ If ``self.nesting_field.sequential`` is ``False``, each example in the batch must
469
+ be a list of string tokens, and pads them as if by a ``Field`` with
470
+ ``sequential=True``. Otherwise, each example must be a list of list of tokens.
471
+ Using ``self.nesting_field``, pads the list of tokens to
472
+ ``self.nesting_field.fix_length`` if provided, or otherwise to the length of the
473
+ longest list of tokens in the batch. Next, using this field, pads the result by
474
+ filling short examples with ``self.nesting_field.pad_token``.
475
+
476
+ Example:
477
+ >>> import pprint
478
+ >>> pp = pprint.PrettyPrinter(indent=4)
479
+ >>>
480
+ >>> nesting_field = Field(pad_token='<c>', init_token='<w>', eos_token='</w>')
481
+ >>> field = NestedField(nesting_field, init_token='<s>', eos_token='</s>')
482
+ >>> minibatch = [
483
+ ... [list('john'), list('loves'), list('mary')],
484
+ ... [list('mary'), list('cries')],
485
+ ... ]
486
+ >>> padded = field.pad(minibatch)
487
+ >>> pp.pprint(padded)
488
+ [ [ ['<w>', '<s>', '</w>', '<c>', '<c>', '<c>', '<c>'],
489
+ ['<w>', 'j', 'o', 'h', 'n', '</w>', '<c>'],
490
+ ['<w>', 'l', 'o', 'v', 'e', 's', '</w>'],
491
+ ['<w>', 'm', 'a', 'r', 'y', '</w>', '<c>'],
492
+ ['<w>', '</s>', '</w>', '<c>', '<c>', '<c>', '<c>']],
493
+ [ ['<w>', '<s>', '</w>', '<c>', '<c>', '<c>', '<c>'],
494
+ ['<w>', 'm', 'a', 'r', 'y', '</w>', '<c>'],
495
+ ['<w>', 'c', 'r', 'i', 'e', 's', '</w>'],
496
+ ['<w>', '</s>', '</w>', '<c>', '<c>', '<c>', '<c>'],
497
+ ['<c>', '<c>', '<c>', '<c>', '<c>', '<c>', '<c>']]]
498
+
499
+ Arguments:
500
+ minibatch (list): Each element is a list of string if
501
+ ``self.nesting_field.sequential`` is ``False``, a list of list of string
502
+ otherwise.
503
+
504
+ Returns:
505
+ list: The padded minibatch, or (padded, sentence_lens, word_lengths) if ``self.include_lengths`` is ``True``.
506
+ """
507
+ minibatch = list(minibatch)
508
+ if not self.nesting_field.sequential:
509
+ return super(NestedField, self).pad(minibatch)
510
+
511
+ # Save values of attributes to be monkeypatched
512
+ old_pad_token = self.pad_token
513
+ old_init_token = self.init_token
514
+ old_eos_token = self.eos_token
515
+ old_fix_len = self.nesting_field.fix_length
516
+ # Monkeypatch the attributes
517
+ if self.nesting_field.fix_length is None:
518
+ max_len = max(len(xs) for ex in minibatch for xs in ex)
519
+ fix_len = max_len + 2 - (self.nesting_field.init_token,
520
+ self.nesting_field.eos_token).count(None)
521
+ self.nesting_field.fix_length = fix_len
522
+ self.pad_token = [self.pad_token] * self.nesting_field.fix_length
523
+ if self.init_token is not None:
524
+ # self.init_token = self.nesting_field.pad([[self.init_token]])[0]
525
+ self.init_token = [self.init_token]
526
+ if self.eos_token is not None:
527
+ # self.eos_token = self.nesting_field.pad([[self.eos_token]])[0]
528
+ self.eos_token = [self.eos_token]
529
+ # Do padding
530
+ old_include_lengths = self.include_lengths
531
+ self.include_lengths = True
532
+ self.nesting_field.include_lengths = True
533
+ padded, sentence_lengths = super(NestedField, self).pad(minibatch)
534
+ padded_with_lengths = [self.nesting_field.pad(ex) for ex in padded]
535
+ word_lengths = []
536
+ final_padded = []
537
+ max_sen_len = len(padded[0])
538
+ for (pad, lens), sentence_len in zip(padded_with_lengths, sentence_lengths):
539
+ if sentence_len == max_sen_len:
540
+ lens = lens
541
+ pad = pad
542
+ elif self.pad_first:
543
+ lens[:(max_sen_len - sentence_len)] = (
544
+ [0] * (max_sen_len - sentence_len))
545
+ pad[:(max_sen_len - sentence_len)] = (
546
+ [self.pad_token] * (max_sen_len - sentence_len))
547
+ else:
548
+ lens[-(max_sen_len - sentence_len):] = (
549
+ [0] * (max_sen_len - sentence_len))
550
+ pad[-(max_sen_len - sentence_len):] = (
551
+ [self.pad_token] * (max_sen_len - sentence_len))
552
+ word_lengths.append(lens)
553
+ final_padded.append(pad)
554
+ padded = final_padded
555
+
556
+ # Restore monkeypatched attributes
557
+ self.nesting_field.fix_length = old_fix_len
558
+ self.pad_token = old_pad_token
559
+ self.init_token = old_init_token
560
+ self.eos_token = old_eos_token
561
+ self.include_lengths = old_include_lengths
562
+ if self.include_lengths:
563
+ return padded, sentence_lengths, word_lengths
564
+ return padded
565
+
566
+ def build_vocab(self, *args, **kwargs):
567
+ """Construct the Vocab object for nesting field and combine it with this field's vocab.
568
+
569
+ Arguments:
570
+ Positional arguments: Dataset objects or other iterable data
571
+ sources from which to construct the Vocab object that
572
+ represents the set of possible values for the nesting field. If
573
+ a Dataset object is provided, all columns corresponding
574
+ to this field are used; individual columns can also be
575
+ provided directly.
576
+ Remaining keyword arguments: Passed to the constructor of Vocab.
577
+ """
578
+ sources = []
579
+ for arg in args:
580
+ sources.append(arg)
581
+
582
+ flattened = []
583
+ for source in sources:
584
+ flattened.extend(source)
585
+ old_vectors = None
586
+ old_unk_init = None
587
+ old_vectors_cache = None
588
+ if "vectors" in kwargs.keys():
589
+ old_vectors = kwargs["vectors"]
590
+ kwargs["vectors"] = None
591
+ if "unk_init" in kwargs.keys():
592
+ old_unk_init = kwargs["unk_init"]
593
+ kwargs["unk_init"] = None
594
+ if "vectors_cache" in kwargs.keys():
595
+ old_vectors_cache = kwargs["vectors_cache"]
596
+ kwargs["vectors_cache"] = None
597
+ # build the vocab only; word vectors (if any) are loaded afterwards
598
+ self.nesting_field.build_vocab(*flattened, **kwargs)
599
+ super(NestedField, self).build_vocab()
600
+ self.vocab.extend(self.nesting_field.vocab)
601
+ self.vocab.freqs = self.nesting_field.vocab.freqs.copy()
602
+ if old_vectors is not None:
603
+ self.vocab.load_vectors(old_vectors,
604
+ unk_init=old_unk_init, cache=old_vectors_cache)
605
+
606
+ self.nesting_field.vocab = self.vocab
607
+
608
+ def numericalize(self, arrs, device=None):
609
+ """Convert a padded minibatch into a variable tensor.
610
+
611
+ Each item in the minibatch will be numericalized independently and the resulting
612
+ tensors will be stacked at the first dimension.
613
+
614
+ Arguments:
615
+ arr (List[List[str]]): List of tokenized and padded examples.
616
+ device (str or torch.device): A string or instance of `torch.device`
617
+ specifying which device the Variables are going to be created on.
618
+ If left as default, the tensors will be created on cpu. Default: None.
619
+ """
620
+ numericalized = []
621
+ self.nesting_field.include_lengths = False
622
+ if self.include_lengths:
623
+ arrs, sentence_lengths, word_lengths = arrs
624
+
625
+ for arr in arrs:
626
+ numericalized_ex = self.nesting_field.numericalize(
627
+ arr, device=device)
628
+ numericalized.append(numericalized_ex)
629
+ padded_batch = torch.stack(numericalized)
630
+
631
+ self.nesting_field.include_lengths = True
632
+ if self.include_lengths:
633
+ sentence_lengths = \
634
+ torch.tensor(sentence_lengths, dtype=self.dtype, device=device)
635
+ word_lengths = torch.tensor(word_lengths, dtype=self.dtype, device=device)
636
+ return (padded_batch, sentence_lengths, word_lengths)
637
+ return padded_batch
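
For orientation, a minimal sketch (not part of the commit) of how this vendored `Field` is typically driven end to end: build a vocabulary from tokenized examples, then let `process` pad and numericalize a batch. It assumes the constructor keeps the legacy torchtext defaults (`sequential=True`, `use_vocab=True`).

```python
# Illustrative sketch only; assumes the legacy torchtext-style Field constructor.
from data.field.mini_torchtext.field import Field

field = Field(batch_first=True, include_lengths=True)

examples = [["john", "loves", "mary"], ["mary", "cries"]]  # already tokenized
field.build_vocab(examples)                                # count tokens, add <unk>/<pad>

tensor, lengths = field.process(examples)                  # pad -> numericalize
print(tensor.shape)  # torch.Size([2, 3])
print(lengths)       # tensor([3, 2])
```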
data/field/mini_torchtext/pipeline.py ADDED
@@ -0,0 +1,86 @@
1
+ class Pipeline(object):
2
+ """Defines a pipeline for transforming sequence data.
3
+
4
+ The input is assumed to be utf-8 encoded `str` (Python 3) or
5
+ `unicode` (Python 2).
6
+
7
+ Attributes:
8
+ convert_token: The function to apply to input sequence data.
9
+ pipes: The Pipelines that will be applied to input sequence
10
+ data in order.
11
+ """
12
+
13
+ def __init__(self, convert_token=None):
14
+ """Create a pipeline.
15
+
16
+ Arguments:
17
+ convert_token: The function to apply to input sequence data.
18
+ If None, the identity function is used. Default: None
19
+ """
20
+ if convert_token is None:
21
+ self.convert_token = Pipeline.identity
22
+ elif callable(convert_token):
23
+ self.convert_token = convert_token
24
+ else:
25
+ raise ValueError("Pipeline input convert_token {} is not None "
26
+ "or callable".format(convert_token))
27
+ self.pipes = [self]
28
+
29
+ def __call__(self, x, *args):
30
+ """Apply the the current Pipeline(s) to an input.
31
+
32
+ Arguments:
33
+ x: The input to process with the Pipeline(s).
34
+ Positional arguments: Forwarded to the `call` function
35
+ of the Pipeline(s).
36
+ """
37
+ for pipe in self.pipes:
38
+ x = pipe.call(x, *args)
39
+ return x
40
+
41
+ def call(self, x, *args):
42
+ """Apply _only_ the convert_token function of the current pipeline
43
+ to the input. If the input is a list, a list with the results of
44
+ applying the `convert_token` function to all input elements is
45
+ returned.
46
+
47
+ Arguments:
48
+ x: The input to apply the convert_token function to.
49
+ Positional arguments: Forwarded to the `convert_token` function
50
+ of the current Pipeline.
51
+ """
52
+ if isinstance(x, list):
53
+ return [self.convert_token(tok, *args) for tok in x]
54
+ return self.convert_token(x, *args)
55
+
56
+ def add_before(self, pipeline):
57
+ """Add a Pipeline to be applied before this processing pipeline.
58
+
59
+ Arguments:
60
+ pipeline: The Pipeline or callable to apply before this
61
+ Pipeline.
62
+ """
63
+ if not isinstance(pipeline, Pipeline):
64
+ pipeline = Pipeline(pipeline)
65
+ self.pipes = pipeline.pipes[:] + self.pipes[:]
66
+ return self
67
+
68
+ def add_after(self, pipeline):
69
+ """Add a Pipeline to be applied after this processing pipeline.
70
+
71
+ Arguments:
72
+ pipeline: The Pipeline or callable to apply after this
73
+ Pipeline.
74
+ """
75
+ if not isinstance(pipeline, Pipeline):
76
+ pipeline = Pipeline(pipeline)
77
+ self.pipes = self.pipes[:] + pipeline.pipes[:]
78
+ return self
79
+
80
+ @staticmethod
81
+ def identity(x):
82
+ """Return a copy of the input.
83
+
84
+ This is here for serialization compatibility with pickle.
85
+ """
86
+ return x
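
A short, hypothetical usage sketch of `Pipeline` chaining (the tokens and lambdas below are invented for illustration):

```python
# Illustrative sketch only: compose two Pipelines with add_after.
from data.field.mini_torchtext.pipeline import Pipeline

lower = Pipeline(str.lower)
strip = Pipeline(lambda tok: tok.strip(".,!?"))
pipe = lower.add_after(strip)          # lower-case first, then strip punctuation

print(pipe("Hello!"))                  # 'hello'
print(pipe(["Great,", "thanks!"]))     # ['great', 'thanks'] (lists are mapped element-wise)
```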
data/field/mini_torchtext/utils.py ADDED
@@ -0,0 +1,256 @@
1
+ import random
2
+ from contextlib import contextmanager
3
+ from copy import deepcopy
4
+ import re
5
+
6
+ from functools import partial
7
+
8
+
9
+ def _split_tokenizer(x):
10
+ return x.split()
11
+
12
+
13
+ def _spacy_tokenize(x, spacy):
14
+ return [tok.text for tok in spacy.tokenizer(x)]
15
+
16
+
17
+ _patterns = [r'\'',
18
+ r'\"',
19
+ r'\.',
20
+ r'<br \/>',
21
+ r',',
22
+ r'\(',
23
+ r'\)',
24
+ r'\!',
25
+ r'\?',
26
+ r'\;',
27
+ r'\:',
28
+ r'\s+']
29
+
30
+ _replacements = [' \' ',
31
+ '',
32
+ ' . ',
33
+ ' ',
34
+ ' , ',
35
+ ' ( ',
36
+ ' ) ',
37
+ ' ! ',
38
+ ' ? ',
39
+ ' ',
40
+ ' ',
41
+ ' ']
42
+
43
+ _patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))
44
+
45
+
46
+ def _basic_english_normalize(line):
47
+ r"""
48
+ Basic normalization for a line of text.
49
+ Normalization includes
50
+ - lowercasing
51
+ - complete some basic text normalization for English words as follows:
52
+ add spaces before and after '\''
53
+ remove '\"',
54
+ add spaces before and after '.'
55
+ replace '<br \/>' with single space
56
+ add spaces before and after ','
57
+ add spaces before and after '('
58
+ add spaces before and after ')'
59
+ add spaces before and after '!'
60
+ add spaces before and after '?'
61
+ replace ';' with single space
62
+ replace ':' with single space
63
+ replace multiple spaces with single space
64
+
65
+ Returns a list of tokens after splitting on whitespace.
66
+ """
67
+
68
+ line = line.lower()
69
+ for pattern_re, replaced_str in _patterns_dict:
70
+ line = pattern_re.sub(replaced_str, line)
71
+ return line.split()
72
+
73
+
74
+ def get_tokenizer(tokenizer, language='en'):
75
+ r"""
76
+ Generate tokenizer function for a string sentence.
77
+
78
+ Arguments:
79
+ tokenizer: the name of tokenizer function. If None, it returns split()
80
+ function, which splits the string sentence by space.
81
+ If basic_english, it returns _basic_english_normalize() function,
82
+ which normalize the string first and split by space. If a callable
83
+ function, it will return the function. If a tokenizer library
84
+ (e.g. spacy, moses, toktok, revtok, subword), it returns the
85
+ corresponding library.
86
+ language: Default en
87
+
88
+ Examples:
89
+ >>> import torchtext
90
+ >>> from torchtext.data import get_tokenizer
91
+ >>> tokenizer = get_tokenizer("basic_english")
92
+ >>> tokens = tokenizer("You can now install TorchText using pip!")
93
+ >>> tokens
94
+ >>> ['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']
95
+
96
+ """
97
+
98
+ # default tokenizer is string.split(), added as a module function for serialization
99
+ if tokenizer is None:
100
+ return _split_tokenizer
101
+
102
+ if tokenizer == "basic_english":
103
+ if language != 'en':
104
+ raise ValueError("Basic normalization is only available for Enlish(en)")
105
+ return _basic_english_normalize
106
+
107
+ # simply return if a function is passed
108
+ if callable(tokenizer):
109
+ return tokenizer
110
+
111
+ if tokenizer == "spacy":
112
+ try:
113
+ import spacy
114
+ spacy = spacy.load(language)
115
+ return partial(_spacy_tokenize, spacy=spacy)
116
+ except ImportError:
117
+ print("Please install SpaCy. "
118
+ "See the docs at https://spacy.io for more information.")
119
+ raise
120
+ except AttributeError:
121
+ print("Please install SpaCy and the SpaCy {} tokenizer. "
122
+ "See the docs at https://spacy.io for more "
123
+ "information.".format(language))
124
+ raise
125
+ elif tokenizer == "moses":
126
+ try:
127
+ from sacremoses import MosesTokenizer
128
+ moses_tokenizer = MosesTokenizer()
129
+ return moses_tokenizer.tokenize
130
+ except ImportError:
131
+ print("Please install SacreMoses. "
132
+ "See the docs at https://github.com/alvations/sacremoses "
133
+ "for more information.")
134
+ raise
135
+ elif tokenizer == "toktok":
136
+ try:
137
+ from nltk.tokenize.toktok import ToktokTokenizer
138
+ toktok = ToktokTokenizer()
139
+ return toktok.tokenize
140
+ except ImportError:
141
+ print("Please install NLTK. "
142
+ "See the docs at https://nltk.org for more information.")
143
+ raise
144
+ elif tokenizer == 'revtok':
145
+ try:
146
+ import revtok
147
+ return revtok.tokenize
148
+ except ImportError:
149
+ print("Please install revtok.")
150
+ raise
151
+ elif tokenizer == 'subword':
152
+ try:
153
+ import revtok
154
+ return partial(revtok.tokenize, decap=True)
155
+ except ImportError:
156
+ print("Please install revtok.")
157
+ raise
158
+ raise ValueError("Requested tokenizer {}, valid choices are a "
159
+ "callable that takes a single string as input, "
160
+ "\"revtok\" for the revtok reversible tokenizer, "
161
+ "\"subword\" for the revtok caps-aware tokenizer, "
162
+ "\"spacy\" for the SpaCy English tokenizer, or "
163
+ "\"moses\" for the NLTK port of the Moses tokenization "
164
+ "script.".format(tokenizer))
165
+
166
+
167
+ def is_tokenizer_serializable(tokenizer, language):
168
+ """Extend with other tokenizers which are found to not be serializable
169
+ """
170
+ if tokenizer == 'spacy':
171
+ return False
172
+ return True
173
+
174
+
175
+ def interleave_keys(a, b):
176
+ """Interleave bits from two sort keys to form a joint sort key.
177
+
178
+ Examples that are similar in both of the provided keys will have similar
179
+ values for the key defined by this function. Useful for tasks with two
180
+ text fields like machine translation or natural language inference.
181
+ """
182
+ def interleave(args):
183
+ return ''.join([x for t in zip(*args) for x in t])
184
+ return int(''.join(interleave(format(x, '016b') for x in (a, b))), base=2)
185
+
186
+
187
+ def get_torch_version():
188
+ import torch
189
+ v = torch.__version__
190
+ version_substrings = v.split('.')
191
+ major, minor = version_substrings[0], version_substrings[1]
192
+ return int(major), int(minor)
193
+
194
+
195
+ def dtype_to_attr(dtype):
196
+ # convert torch.dtype to dtype string id
197
+ # e.g. torch.int32 -> "int32"
198
+ # used for serialization
199
+ _, dtype = str(dtype).split('.')
200
+ return dtype
201
+
202
+
203
+ # TODO: Write more tests!
204
+ def ngrams_iterator(token_list, ngrams):
205
+ """Return an iterator that yields the given tokens and their ngrams.
206
+
207
+ Arguments:
208
+ token_list: A list of tokens
209
+ ngrams: the number of ngrams.
210
+
211
+ Examples:
212
+ >>> token_list = ['here', 'we', 'are']
213
+ >>> list(ngrams_iterator(token_list, 2))
214
+ >>> ['here', 'we', 'are', 'here we', 'we are']
215
+ """
216
+
217
+ def _get_ngrams(n):
218
+ return zip(*[token_list[i:] for i in range(n)])
219
+
220
+ for x in token_list:
221
+ yield x
222
+ for n in range(2, ngrams + 1):
223
+ for x in _get_ngrams(n):
224
+ yield ' '.join(x)
225
+
226
+
227
+ class RandomShuffler(object):
228
+ """Use random functions while keeping track of the random state to make it
229
+ reproducible and deterministic."""
230
+
231
+ def __init__(self, random_state=None):
232
+ self._random_state = random_state
233
+ if self._random_state is None:
234
+ self._random_state = random.getstate()
235
+
236
+ @contextmanager
237
+ def use_internal_state(self):
238
+ """Use a specific RNG state."""
239
+ old_state = random.getstate()
240
+ random.setstate(self._random_state)
241
+ yield
242
+ self._random_state = random.getstate()
243
+ random.setstate(old_state)
244
+
245
+ @property
246
+ def random_state(self):
247
+ return deepcopy(self._random_state)
248
+
249
+ @random_state.setter
250
+ def random_state(self, s):
251
+ self._random_state = s
252
+
253
+ def __call__(self, data):
254
+ """Shuffle and return a new list."""
255
+ with self.use_internal_state():
256
+ return random.sample(data, len(data))
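
As a quick, illustrative check of two helpers above (`get_tokenizer` with its whitespace default, and `ngrams_iterator`):

```python
# Illustrative sketch only.
from data.field.mini_torchtext.utils import get_tokenizer, ngrams_iterator

tokenize = get_tokenizer(None)              # falls back to str.split
tokens = tokenize("very easy to use")
print(list(ngrams_iterator(tokens, 2)))
# ['very', 'easy', 'to', 'use', 'very easy', 'easy to', 'to use']
```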
data/field/mini_torchtext/vocab.py ADDED
@@ -0,0 +1,116 @@
1
+ from __future__ import unicode_literals
2
+ from collections import defaultdict
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
+ class Vocab(object):
9
+ """Defines a vocabulary object that will be used to numericalize a field.
10
+
11
+ Attributes:
12
+ freqs: A collections.Counter object holding the frequencies of tokens
13
+ in the data used to build the Vocab.
14
+ stoi: A collections.defaultdict instance mapping token strings to
15
+ numerical identifiers.
16
+ itos: A list of token strings indexed by their numerical identifiers.
17
+ """
18
+
19
+ # TODO (@mttk): Populate class with default values of special symbols
20
+ UNK = '<unk>'
21
+
22
+ def __init__(self, counter, max_size=None, min_freq=1, specials=['<unk>', '<pad>'], specials_first=True):
23
+ """Create a Vocab object from a collections.Counter.
24
+
25
+ Arguments:
26
+ counter: collections.Counter object holding the frequencies of
27
+ each value found in the data.
28
+ max_size: The maximum size of the vocabulary, or None for no
29
+ maximum. Default: None.
30
+ min_freq: The minimum frequency needed to include a token in the
31
+ vocabulary. Values less than 1 will be set to 1. Default: 1.
32
+ specials: The list of special tokens (e.g., padding or eos) that
33
+ will be prepended to the vocabulary. Default: ['<unk>', '<pad>']
34
+ specials_first: Whether to add special tokens into the vocabulary at first.
35
+ If it is False, they are added into the vocabulary at last.
36
+ Default: True.
37
+ """
38
+ self.freqs = counter
39
+ counter = counter.copy()
40
+ min_freq = max(min_freq, 1)
41
+
42
+ self.itos = list()
43
+ self.unk_index = None
44
+ if specials_first:
45
+ self.itos = list(specials)
46
+ # only extend max size if specials are prepended
47
+ max_size = None if max_size is None else max_size + len(specials)
48
+
49
+ # frequencies of special tokens are not counted when building vocabulary
50
+ # in frequency order
51
+ for tok in specials:
52
+ del counter[tok]
53
+
54
+ # sort by frequency, then alphabetically
55
+ words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0])
56
+ words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True)
57
+
58
+ for word, freq in words_and_frequencies:
59
+ if freq < min_freq or len(self.itos) == max_size:
60
+ break
61
+ self.itos.append(word)
62
+
63
+ if Vocab.UNK in specials: # hard-coded for now
64
+ unk_index = specials.index(Vocab.UNK) # position in list
65
+ # account for ordering of specials, set variable
66
+ self.unk_index = unk_index if specials_first else len(self.itos) + unk_index
67
+ self.stoi = defaultdict(self._default_unk_index)
68
+ else:
69
+ self.stoi = defaultdict()
70
+
71
+ if not specials_first:
72
+ self.itos.extend(list(specials))
73
+
74
+ # stoi is simply a reverse dict for itos
75
+ self.stoi.update({tok: i for i, tok in enumerate(self.itos)})
76
+
77
+ def _default_unk_index(self):
78
+ return self.unk_index
79
+
80
+ def __getitem__(self, token):
81
+ return self.stoi.get(token, self.stoi.get(Vocab.UNK))
82
+
83
+ def __getstate__(self):
84
+ # avoid pickling the defaultdict directly
85
+ attrs = dict(self.__dict__)
86
+ # cast to regular dict
87
+ attrs['stoi'] = dict(self.stoi)
88
+ return attrs
89
+
90
+ def __setstate__(self, state):
91
+ if state.get("unk_index", None) is None:
92
+ stoi = defaultdict()
93
+ else:
94
+ stoi = defaultdict(self._default_unk_index)
95
+ stoi.update(state['stoi'])
96
+ state['stoi'] = stoi
97
+ self.__dict__.update(state)
98
+
99
+ def __eq__(self, other):
100
+ if self.freqs != other.freqs:
101
+ return False
102
+ if self.stoi != other.stoi:
103
+ return False
104
+ if self.itos != other.itos:
105
+ return False
106
+ return True
107
+
108
+ def __len__(self):
109
+ return len(self.itos)
110
+
111
+ def extend(self, v, sort=False):
112
+ words = sorted(v.itos) if sort else v.itos
113
+ for w in words:
114
+ if w not in self.stoi:
115
+ self.itos.append(w)
116
+ self.stoi[w] = len(self.itos) - 1
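
A minimal sketch of how this `Vocab` behaves (the label strings below are invented for illustration): specials come first, remaining tokens are ordered by frequency, and unknown tokens fall back to the `<unk>` index.

```python
# Illustrative sketch only.
from collections import Counter
from data.field.mini_torchtext.vocab import Vocab

counter = Counter(["positive", "negative", "positive", "neutral"])
vocab = Vocab(counter, specials=["<unk>", "<pad>"])

print(vocab.itos)                  # ['<unk>', '<pad>', 'positive', 'negative', 'neutral']
print(vocab.stoi["positive"])      # 2
print(vocab.stoi["never-seen"])    # 0 -> default <unk> index
```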
data/field/nested_field.py ADDED
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ from data.field.mini_torchtext.field import NestedField as TorchTextNestedField
6
+
7
+
8
+ class NestedField(TorchTextNestedField):
9
+ def pad(self, example):
10
+ self.nesting_field.include_lengths = self.include_lengths
11
+ if not self.include_lengths:
12
+ return self.nesting_field.pad(example)
13
+
14
+ sentence_length = len(example)
15
+ example, word_lengths = self.nesting_field.pad(example)
16
+ return example, sentence_length, word_lengths
17
+
18
+ def numericalize(self, arr, device=None):
19
+ numericalized = []
20
+ self.nesting_field.include_lengths = False
21
+ if self.include_lengths:
22
+ arr, sentence_length, word_lengths = arr
23
+
24
+ numericalized = self.nesting_field.numericalize(arr, device=device)
25
+
26
+ self.nesting_field.include_lengths = True
27
+ if self.include_lengths:
28
+ sentence_length = torch.tensor(sentence_length, dtype=self.dtype, device=device)
29
+ word_lengths = torch.tensor(word_lengths, dtype=self.dtype, device=device)
30
+ return (numericalized, sentence_length, word_lengths)
31
+ return numericalized
32
+
33
+ def build_vocab(self, *args, **kwargs):
34
+ sources = []
35
+ for arg in args:
36
+ if isinstance(arg, torch.utils.data.Dataset):
37
+ sources += [arg.get_examples(name) for name, field in arg.fields.items() if field is self]
38
+ else:
39
+ sources.append(arg)
40
+
41
+ flattened = []
42
+ for source in sources:
43
+ flattened.extend(source)
44
+
45
+ # build the vocab only; word vectors are not loaded here
46
+ self.nesting_field.build_vocab(*flattened, **kwargs)
47
+ super(TorchTextNestedField, self).build_vocab()
48
+ self.vocab.extend(self.nesting_field.vocab)
49
+ self.vocab.freqs = self.nesting_field.vocab.freqs.copy()
50
+ self.nesting_field.vocab = self.vocab
data/parser/__init__.py ADDED
File without changes
data/parser/from_mrp/__init__.py ADDED
File without changes
data/parser/from_mrp/abstract_parser.py ADDED
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ from data.parser.json_parser import example_from_json
6
+
7
+
8
+ class AbstractParser(torch.utils.data.Dataset):
9
+ def __init__(self, fields, data, filter_pred=None):
10
+ super(AbstractParser, self).__init__()
11
+
12
+ self.examples = [example_from_json(d, fields) for _, d in sorted(data.items())]
13
+
14
+ if isinstance(fields, dict):
15
+ fields, field_dict = [], fields
16
+ for field in field_dict.values():
17
+ if isinstance(field, list):
18
+ fields.extend(field)
19
+ else:
20
+ fields.append(field)
21
+
22
+ if filter_pred is not None:
23
+ make_list = isinstance(self.examples, list)
24
+ self.examples = filter(filter_pred, self.examples)
25
+ if make_list:
26
+ self.examples = list(self.examples)
27
+
28
+ self.fields = dict(fields)
29
+
30
+ # Unpack field tuples
31
+ for n, f in list(self.fields.items()):
32
+ if isinstance(n, tuple):
33
+ self.fields.update(zip(n, f))
34
+ del self.fields[n]
35
+
36
+ def __getitem__(self, i):
37
+ item = self.examples[i]
38
+ processed_item = {}
39
+ for (name, field) in self.fields.items():
40
+ if field is not None:
41
+ processed_item[name] = field.process(getattr(item, name), device=None)
42
+ return processed_item
43
+
44
+ def __len__(self):
45
+ return len(self.examples)
46
+
47
+ def get_examples(self, attr):
48
+ if attr in self.fields:
49
+ for x in self.examples:
50
+ yield getattr(x, attr)
data/parser/from_mrp/evaluation_parser.py ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ from data.parser.from_mrp.abstract_parser import AbstractParser
5
+ import utility.parser_utils as utils
6
+
7
+
8
+ class EvaluationParser(AbstractParser):
9
+ def __init__(self, args, fields):
10
+ path = args.test_data
11
+ self.data = utils.load_dataset(path)
12
+
13
+ for sentence in self.data.values():
14
+ sentence["token anchors"] = [[a["from"], a["to"]] for a in sentence["token anchors"]]
15
+
16
+ utils.create_bert_tokens(self.data, args.encoder)
17
+
18
+ super(EvaluationParser, self).__init__(fields, self.data)
data/parser/from_mrp/labeled_edge_parser.py ADDED
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ from data.parser.from_mrp.abstract_parser import AbstractParser
5
+ import utility.parser_utils as utils
6
+
7
+
8
+ class LabeledEdgeParser(AbstractParser):
9
+ def __init__(self, args, part: str, fields, filter_pred=None, **kwargs):
10
+ assert part == "training" or part == "validation"
11
+ path = args.training_data if part == "training" else args.validation_data
12
+
13
+ self.data = utils.load_dataset(path)
14
+ utils.anchor_ids_from_intervals(self.data)
15
+
16
+ self.node_counter, self.edge_counter, self.no_edge_counter = 0, 0, 0
17
+ anchor_count, n_node_token_pairs = 0, 0
18
+
19
+ for sentence_id, sentence in list(self.data.items()):
20
+ for edge in sentence["edges"]:
21
+ if "label" not in edge:
22
+ del self.data[sentence_id]
23
+ break
24
+
25
+ for node, sentence in utils.node_generator(self.data):
26
+ node["label"] = "Node"
27
+
28
+ self.node_counter += 1
29
+
30
+ utils.create_bert_tokens(self.data, args.encoder)
31
+
32
+ # create edge vectors
33
+ for sentence in self.data.values():
34
+ assert sentence["tops"] == [0], sentence
35
+ N = len(sentence["nodes"])
36
+
37
+ edge_count = utils.create_edges(sentence)
38
+ self.edge_counter += edge_count
39
+ self.no_edge_counter += N * (N - 1) - edge_count
40
+
41
+ sentence["nodes"] = sentence["nodes"][1:]
42
+ N = len(sentence["nodes"])
43
+
44
+ sentence["anchor edges"] = [N, len(sentence["input"]), []]
45
+ sentence["source anchor edges"] = [N, len(sentence["input"]), []] # dummy
46
+ sentence["target anchor edges"] = [N, len(sentence["input"]), []] # dummy
47
+ sentence["anchored labels"] = [len(sentence["input"]), []]
48
+ for i, node in enumerate(sentence["nodes"]):
49
+ anchored_labels = []
50
+
51
+ for anchor in node["anchors"]:
52
+ sentence["anchor edges"][-1].append((i, anchor))
53
+ anchored_labels.append((anchor, node["label"]))
54
+
55
+ sentence["anchored labels"][1].append(anchored_labels)
56
+
57
+ anchor_count += len(node["anchors"])
58
+ n_node_token_pairs += len(sentence["input"])
59
+
60
+ sentence["id"] = [sentence["id"]]
61
+
62
+ self.anchor_freq = anchor_count / n_node_token_pairs
63
+ self.source_anchor_freq = self.target_anchor_freq = 0.5 # dummy
64
+ self.input_count = sum(len(sentence["input"]) for sentence in self.data.values())
65
+
66
+ super(LabeledEdgeParser, self).__init__(fields, self.data, filter_pred)
67
+
68
+ @staticmethod
69
+ def node_similarity_key(node):
70
+ return tuple([node["label"]] + node["anchors"])
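
The `"anchor edges"` field built in the loop above is a plain nested list of the form `[num_nodes, num_tokens, [(node_index, token_index), ...]]`. A tiny, hypothetical example of what it looks like for two nodes anchored to tokens {0} and {2, 3} of a four-token sentence:

```python
# Hypothetical values, for illustration of the structure only.
anchor_edges = [2, 4, [(0, 0), (1, 2), (1, 3)]]   # [num_nodes, num_tokens, node-token pairs]
```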
data/parser/from_mrp/node_centric_parser.py ADDED
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ from data.parser.from_mrp.abstract_parser import AbstractParser
5
+ import utility.parser_utils as utils
6
+
7
+
8
+ class NodeCentricParser(AbstractParser):
9
+ def __init__(self, args, part: str, fields, filter_pred=None, **kwargs):
10
+ assert part == "training" or part == "validation"
11
+ path = args.training_data if part == "training" else args.validation_data
12
+
13
+ self.data = utils.load_dataset(path)
14
+ utils.anchor_ids_from_intervals(self.data)
15
+
16
+ self.node_counter, self.edge_counter, self.no_edge_counter = 0, 0, 0
17
+ anchor_count, n_node_token_pairs = 0, 0
18
+
19
+ for sentence_id, sentence in list(self.data.items()):
20
+ for node in sentence["nodes"]:
21
+ if "label" not in node:
22
+ del self.data[sentence_id]
23
+ break
24
+
25
+ for node, _ in utils.node_generator(self.data):
26
+ self.node_counter += 1
27
+
28
+ # print(f"Number of unlabeled nodes: {unlabeled_count}", flush=True)
29
+
30
+ utils.create_bert_tokens(self.data, args.encoder)
31
+
32
+ # create edge vectors
33
+ for sentence in self.data.values():
34
+ N = len(sentence["nodes"])
35
+
36
+ edge_count = utils.create_edges(sentence)
37
+ self.edge_counter += edge_count
38
+ # self.no_edge_counter += len([n for n in sentence["nodes"] if n["label"] in ["Source", "Target"]]) * len([n for n in sentence["nodes"] if n["label"] not in ["Source", "Target"]]) - edge_count
39
+ self.no_edge_counter += N * (N - 1) - edge_count
40
+
41
+ sentence["anchor edges"] = [N, len(sentence["input"]), []]
42
+ sentence["source anchor edges"] = [N, len(sentence["input"]), []] # dummy
43
+ sentence["target anchor edges"] = [N, len(sentence["input"]), []] # dummy
44
+ sentence["anchored labels"] = [len(sentence["input"]), []]
45
+ for i, node in enumerate(sentence["nodes"]):
46
+ anchored_labels = []
47
+ #if len(node["anchors"]) == 0:
48
+ # print(f"Empty node in {sentence['id']}", flush=True)
49
+
50
+ for anchor in node["anchors"]:
51
+ sentence["anchor edges"][-1].append((i, anchor))
52
+ anchored_labels.append((anchor, node["label"]))
53
+
54
+ sentence["anchored labels"][1].append(anchored_labels)
55
+
56
+ anchor_count += len(node["anchors"])
57
+ n_node_token_pairs += len(sentence["input"])
58
+
59
+ sentence["id"] = [sentence["id"]]
60
+
61
+ self.anchor_freq = anchor_count / n_node_token_pairs
62
+ self.source_anchor_freq = self.target_anchor_freq = 0.5 # dummy
63
+ self.input_count = sum(len(sentence["input"]) for sentence in self.data.values())
64
+
65
+ super(NodeCentricParser, self).__init__(fields, self.data, filter_pred)
66
+
67
+ @staticmethod
68
+ def node_similarity_key(node):
69
+ return tuple([node["label"]] + node["anchors"])
data/parser/from_mrp/request_parser.py ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import utility.parser_utils as utils
5
+ from data.parser.from_mrp.abstract_parser import AbstractParser
6
+
7
+
8
+ class RequestParser(AbstractParser):
9
+ def __init__(self, sentences, args, fields):
10
+ self.data = {i: {"id": str(i), "sentence": sentence} for i, sentence in enumerate(sentences)}
11
+
12
+ sentences = [example["sentence"] for example in self.data.values()]
13
+
14
+ for example in self.data.values():
15
+ example["input"] = example["sentence"].strip().split(' ')
16
+ example["token anchors"], offset = [], 0
17
+ for token in example["input"]:
18
+ example["token anchors"].append([offset, offset + len(token)])
19
+ offset += len(token) + 1
20
+
21
+ utils.create_bert_tokens(self.data, args.encoder)
22
+
23
+ super(RequestParser, self).__init__(fields, self.data)
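
The character-level token anchors above are simple running offsets over a whitespace-tokenized sentence; a self-contained sketch of the same arithmetic (the Norwegian sentence is invented for illustration):

```python
# Illustrative sketch only: character offsets for whitespace tokens.
sentence = "Filmen var helt fantastisk"
tokens, anchors, offset = sentence.strip().split(' '), [], 0
for token in tokens:
    anchors.append([offset, offset + len(token)])
    offset += len(token) + 1          # +1 skips the single separating space

print(anchors)  # [[0, 6], [7, 10], [11, 15], [16, 26]]
```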
data/parser/from_mrp/sequential_parser.py ADDED
@@ -0,0 +1,90 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ from data.parser.from_mrp.abstract_parser import AbstractParser
5
+ import utility.parser_utils as utils
6
+
7
+
8
+ class SequentialParser(AbstractParser):
9
+ def __init__(self, args, part: str, fields, filter_pred=None, **kwargs):
10
+ assert part == "training" or part == "validation"
11
+ path = args.training_data if part == "training" else args.validation_data
12
+
13
+ self.data = utils.load_dataset(path)
14
+ utils.anchor_ids_from_intervals(self.data)
15
+
16
+ self.node_counter, self.edge_counter, self.no_edge_counter = 0, 0, 0
17
+ anchor_count, source_anchor_count, target_anchor_count, n_node_token_pairs = 0, 0, 0, 0
18
+
19
+ for sentence_id, sentence in list(self.data.items()):
20
+ for node in sentence["nodes"]:
21
+ if "label" not in node:
22
+ del self.data[sentence_id]
23
+ break
24
+
25
+ for node, _ in utils.node_generator(self.data):
26
+ node["target anchors"] = []
27
+ node["source anchors"] = []
28
+
29
+ for sentence in self.data.values():
30
+ for e in sentence["edges"]:
31
+ source, target = e["source"], e["target"]
32
+
33
+ if sentence["nodes"][target]["label"] == "Target":
34
+ sentence["nodes"][source]["target anchors"] += sentence["nodes"][target]["anchors"]
35
+ elif sentence["nodes"][target]["label"] == "Source":
36
+ sentence["nodes"][source]["source anchors"] += sentence["nodes"][target]["anchors"]
37
+
38
+ for i, node in list(enumerate(sentence["nodes"]))[::-1]:
39
+ if "label" not in node or node["label"] in ["Source", "Target"]:
40
+ del sentence["nodes"][i]
41
+ sentence["edges"] = []
42
+
43
+ for node, sentence in utils.node_generator(self.data):
44
+ self.node_counter += 1
45
+
46
+ utils.create_bert_tokens(self.data, args.encoder)
47
+
48
+ # create edge vectors
49
+ for sentence in self.data.values():
50
+ N = len(sentence["nodes"])
51
+
52
+ utils.create_edges(sentence)
53
+ self.no_edge_counter += N * (N - 1)
54
+
55
+ sentence["anchor edges"] = [N, len(sentence["input"]), []]
56
+ sentence["source anchor edges"] = [N, len(sentence["input"]), []]
57
+ sentence["target anchor edges"] = [N, len(sentence["input"]), []]
58
+
59
+ sentence["anchored labels"] = [len(sentence["input"]), []]
60
+ for i, node in enumerate(sentence["nodes"]):
61
+ anchored_labels = []
62
+
63
+ for anchor in node["anchors"]:
64
+ sentence["anchor edges"][-1].append((i, anchor))
65
+ anchored_labels.append((anchor, node["label"]))
66
+
67
+ for anchor in node["source anchors"]:
68
+ sentence["source anchor edges"][-1].append((i, anchor))
69
+ for anchor in node["target anchors"]:
70
+ sentence["target anchor edges"][-1].append((i, anchor))
71
+
72
+ sentence["anchored labels"][1].append(anchored_labels)
73
+
74
+ anchor_count += len(node["anchors"])
75
+ source_anchor_count += len(node["source anchors"])
76
+ target_anchor_count += len(node["target anchors"])
77
+ n_node_token_pairs += len(sentence["input"])
78
+
79
+ sentence["id"] = [sentence["id"]]
80
+
81
+ self.anchor_freq = anchor_count / n_node_token_pairs
82
+ self.source_anchor_freq = source_anchor_count / n_node_token_pairs
83
+ self.target_anchor_freq = target_anchor_count / n_node_token_pairs
84
+ self.input_count = sum(len(sentence["input"]) for sentence in self.data.values())
85
+
86
+ super(SequentialParser, self).__init__(fields, self.data, filter_pred)
87
+
88
+ @staticmethod
89
+ def node_similarity_key(node):
90
+ return tuple([node["label"]] + node["anchors"])
data/parser/json_parser.py ADDED
@@ -0,0 +1,35 @@
1
+ from functools import reduce
2
+ from data.field.mini_torchtext.example import Example
3
+
4
+
5
+ def example_from_json(obj, fields):
6
+ ex = Example()
7
+ for key, vals in fields.items():
8
+ if vals is not None:
9
+ if not isinstance(vals, list):
10
+ vals = [vals]
11
+ for val in vals:
12
+ # for processing the key likes 'foo.bar'
13
+ name, field = val
14
+ ks = key.split(".")
15
+
16
+ def reducer(obj, key):
17
+ if isinstance(obj, list):
18
+ results = []
19
+ for data in obj:
20
+ if key not in data:
21
+ # key error
22
+ raise ValueError("Specified key {} was not found in " "the input data".format(key))
23
+ else:
24
+ results.append(data[key])
25
+ return results
26
+ else:
27
+ # key error
28
+ if key not in obj:
29
+ raise ValueError("Specified key {} was not found in " "the input data".format(key))
30
+ else:
31
+ return obj[key]
32
+
33
+ v = reduce(reducer, ks, obj)
34
+ setattr(ex, name, field.preprocess(v))
35
+ return ex
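
A small, hypothetical illustration of how `example_from_json` resolves a dotted key such as `"nodes.label"` against nested JSON (the `IdentityField` stub below is a made-up stand-in for a real field):

```python
# Illustrative sketch only; IdentityField is not part of the repository.
from data.parser.json_parser import example_from_json

class IdentityField:
    def preprocess(self, x):
        return x

obj = {"id": "42", "nodes": [{"label": "Positive"}, {"label": "Negative"}]}
fields = {"id": ("id", IdentityField()), "nodes.label": ("labels", IdentityField())}

ex = example_from_json(obj, fields)
print(ex.id)      # '42'
print(ex.labels)  # ['Positive', 'Negative']
```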
data/parser/to_mrp/__init__.py ADDED
File without changes
data/parser/to_mrp/abstract_parser.py ADDED
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ class AbstractParser:
5
+ def __init__(self, dataset):
6
+ self.dataset = dataset
7
+
8
+ def create_nodes(self, prediction):
9
+ return [
10
+ {"id": i, "label": self.label_to_str(l, prediction["anchors"][i], prediction)}
11
+ for i, l in enumerate(prediction["labels"])
12
+ ]
13
+
14
+ def label_to_str(self, label, anchors, prediction):
15
+ return self.dataset.label_field.vocab.itos[label - 1]
16
+
17
+ def create_edges(self, prediction, nodes):
18
+ N = len(nodes)
19
+ node_sets = [{"id": n, "set": set([n])} for n in range(N)]
20
+ _, indices = prediction["edge presence"][:N, :N].reshape(-1).sort(descending=True)
21
+ sources, targets = indices // N, indices % N
22
+
23
+ edges = []
24
+ for i in range((N - 1) * N // 2):
25
+ source, target = sources[i].item(), targets[i].item()
26
+ p = prediction["edge presence"][source, target]
27
+
28
+ if p < 0.5 and len(edges) >= N - 1:
29
+ break
30
+
31
+ if node_sets[source]["set"] is node_sets[target]["set"] and p < 0.5:
32
+ continue
33
+
34
+ self.create_edge(source, target, prediction, edges, nodes)
35
+
36
+ if node_sets[source]["set"] is not node_sets[target]["set"]:
37
+ from_set = node_sets[source]["set"]
38
+ for n in node_sets[target]["set"]:
39
+ from_set.add(n)
40
+ node_sets[n]["set"] = from_set
41
+
42
+ return edges
43
+
44
+ def create_edge(self, source, target, prediction, edges, nodes):
45
+ label = self.get_edge_label(prediction, source, target)
46
+ edge = {"source": source, "target": target, "label": label}
47
+
48
+ edges.append(edge)
49
+
50
+ def create_anchors(self, prediction, nodes, join_contiguous=True, at_least_one=False, single_anchor=False, mode="anchors"):
51
+ for i, node in enumerate(nodes):
52
+ threshold = 0.5 if not at_least_one else min(0.5, prediction[mode][i].max().item())
53
+ node[mode] = (prediction[mode][i] >= threshold).nonzero(as_tuple=False).squeeze(-1)
54
+ node[mode] = prediction["token intervals"][node[mode], :]
55
+
56
+ if single_anchor and len(node[mode]) > 1:
57
+ start = min(a[0].item() for a in node[mode])
58
+ end = max(a[1].item() for a in node[mode])
59
+ node[mode] = [{"from": start, "to": end}]
60
+ continue
61
+
62
+ node[mode] = [{"from": f.item(), "to": t.item()} for f, t in node[mode]]
63
+ node[mode] = sorted(node[mode], key=lambda a: a["from"])
64
+
65
+ if join_contiguous and len(node[mode]) > 1:
66
+ cleaned_anchors = []
67
+ end, start = node[mode][0]["from"], node[mode][0]["from"]
68
+ for anchor in node[mode]:
69
+ if end < anchor["from"]:
70
+ cleaned_anchors.append({"from": start, "to": end})
71
+ start = anchor["from"]
72
+ end = anchor["to"]
73
+ cleaned_anchors.append({"from": start, "to": end})
74
+
75
+ node[mode] = cleaned_anchors
76
+
77
+ return nodes
78
+
79
+ def get_edge_label(self, prediction, source, target):
80
+ return self.dataset.edge_label_field.vocab.itos[prediction["edge labels"][source, target].item()]
data/parser/to_mrp/labeled_edge_parser.py ADDED
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ from data.parser.to_mrp.abstract_parser import AbstractParser
5
+
6
+
7
+ class LabeledEdgeParser(AbstractParser):
8
+ def __init__(self, *args):
9
+ super().__init__(*args)
10
+ self.source_id = self.dataset.edge_label_field.vocab.stoi["Source"]
11
+ self.target_id = self.dataset.edge_label_field.vocab.stoi["Target"]
12
+
13
+ def parse(self, prediction):
14
+ output = {}
15
+
16
+ output["id"] = self.dataset.id_field.vocab.itos[prediction["id"].item()]
17
+ output["nodes"] = self.create_nodes(prediction)
18
+ output["nodes"] = self.create_anchors(prediction, output["nodes"], join_contiguous=True, at_least_one=True)
19
+ output["nodes"] = [{"id": 0}] + output["nodes"]
20
+ output["edges"] = self.create_edges(prediction, output["nodes"])
21
+
22
+ return output
23
+
24
+ def create_nodes(self, prediction):
25
+ return [{"id": i + 1} for i, l in enumerate(prediction["labels"])]
26
+
27
+ def create_edges(self, prediction, nodes):
28
+ N = len(nodes)
29
+ edge_prediction = prediction["edge presence"][:N, :N]
30
+
31
+ edges = []
32
+ for target in range(1, N):
33
+ if edge_prediction[0, target] >= 0.5:
34
+ prediction["edge labels"][0, target, self.source_id] = float("-inf")
35
+ prediction["edge labels"][0, target, self.target_id] = float("-inf")
36
+ self.create_edge(0, target, prediction, edges, nodes)
37
+
38
+ for source in range(1, N):
39
+ for target in range(1, N):
40
+ if source == target:
41
+ continue
42
+ if edge_prediction[source, target] < 0.5:
43
+ continue
44
+ for i in range(prediction["edge labels"].size(2)):
45
+ if i not in [self.source_id, self.target_id]:
46
+ prediction["edge labels"][source, target, i] = float("-inf")
47
+ self.create_edge(source, target, prediction, edges, nodes)
48
+
49
+ return edges
50
+
51
+ def get_edge_label(self, prediction, source, target):
52
+ return self.dataset.edge_label_field.vocab.itos[prediction["edge labels"][source, target].argmax(-1).item()]
data/parser/to_mrp/node_centric_parser.py ADDED
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ from data.parser.to_mrp.abstract_parser import AbstractParser
5
+
6
+
7
+ class NodeCentricParser(AbstractParser):
8
+ def parse(self, prediction):
9
+ output = {}
10
+
11
+ output["id"] = self.dataset.id_field.vocab.itos[prediction["id"].item()]
12
+ output["nodes"] = self.create_nodes(prediction)
13
+ output["nodes"] = self.create_anchors(prediction, output["nodes"], join_contiguous=True, at_least_one=True)
14
+ output["edges"] = self.create_edges(prediction, output["nodes"])
15
+
16
+ return output
17
+
18
+ def create_edge(self, source, target, prediction, edges, nodes):
19
+ edge = {"source": source, "target": target, "label": None}
20
+ edges.append(edge)
21
+
22
+ def create_edges(self, prediction, nodes):
23
+ N = len(nodes)
24
+ edge_prediction = prediction["edge presence"][:N, :N]
25
+
26
+ targets = [i for i, node in enumerate(nodes) if node["label"] in ["Source", "Target"]]
27
+ sources = [i for i, node in enumerate(nodes) if node["label"] not in ["Source", "Target"]]
28
+
29
+ edges = []
30
+ for target in targets:
31
+ for source in sources:
32
+ if edge_prediction[source, target] >= 0.5:
33
+ self.create_edge(source, target, prediction, edges, nodes)
34
+
35
+ return edges
data/parser/to_mrp/sequential_parser.py ADDED
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ from data.parser.to_mrp.abstract_parser import AbstractParser
5
+
6
+
7
+ class SequentialParser(AbstractParser):
8
+ def parse(self, prediction):
9
+ output = {}
10
+
11
+ output["id"] = self.dataset.id_field.vocab.itos[prediction["id"].item()]
12
+ output["nodes"] = self.create_nodes(prediction)
13
+ output["nodes"] = self.create_anchors(prediction, output["nodes"], join_contiguous=True, at_least_one=True, mode="anchors")
14
+ output["nodes"] = self.create_anchors(prediction, output["nodes"], join_contiguous=True, at_least_one=False, mode="source anchors")
15
+ output["nodes"] = self.create_anchors(prediction, output["nodes"], join_contiguous=True, at_least_one=False, mode="target anchors")
16
+ output["edges"], output["nodes"] = self.create_targets_sources(output["nodes"])
17
+
18
+ return output
19
+
20
+ def create_targets_sources(self, nodes):
21
+ edges, new_nodes = [], []
22
+ for i, node in enumerate(nodes):
23
+ new_node_id = len(nodes) + len(new_nodes)
24
+ if len(node["source anchors"]) > 0:
25
+ new_nodes.append({"id": new_node_id, "label": "Source", "anchors": node["source anchors"]})
26
+ edges.append({"source": i, "target": new_node_id, "label": ""})
27
+ new_node_id += 1
28
+ del node["source anchors"]
29
+
30
+ if len(node["target anchors"]) > 0:
31
+ new_nodes.append({"id": new_node_id, "label": "Target", "anchors": node["target anchors"]})
32
+ edges.append({"source": i, "target": new_node_id, "label": ""})
33
+ del node["target anchors"]
34
+
35
+ return edges, nodes + new_nodes
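
A minimal sketch of `create_targets_sources`: a predicted opinion node with target anchors is split so the target span becomes its own `"Target"` node, connected by an unlabeled edge. The node contents below are invented, and passing `None` as the dataset is only for illustration, since this method does not touch it.

```python
# Illustrative sketch only.
from data.parser.to_mrp.sequential_parser import SequentialParser

parser = SequentialParser(None)   # dataset unused by create_targets_sources
nodes = [{"id": 0, "label": "Negative",
          "anchors": [{"from": 11, "to": 26}],
          "source anchors": [],
          "target anchors": [{"from": 0, "to": 6}]}]

edges, nodes = parser.create_targets_sources(nodes)
print(edges)     # [{'source': 0, 'target': 1, 'label': ''}]
print(nodes[1])  # {'id': 1, 'label': 'Target', 'anchors': [{'from': 0, 'to': 6}]}
```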
model/__init__.py ADDED
File without changes
model/head/__init__.py ADDED
File without changes
model/head/abstract_head.py ADDED
@@ -0,0 +1,274 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import math
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from model.module.edge_classifier import EdgeClassifier
10
+ from model.module.anchor_classifier import AnchorClassifier
11
+ from utility.cross_entropy import cross_entropy, binary_cross_entropy
12
+ from utility.hungarian_matching import get_matching, reorder, match_anchor, match_label
13
+ from utility.utils import create_padding_mask
14
+
15
+
16
+ class AbstractHead(nn.Module):
17
+ def __init__(self, dataset, args, config, initialize: bool):
18
+ super(AbstractHead, self).__init__()
19
+
20
+ self.edge_classifier = self.init_edge_classifier(dataset, args, config, initialize)
21
+ self.label_classifier = self.init_label_classifier(dataset, args, config, initialize)
22
+ self.anchor_classifier = self.init_anchor_classifier(dataset, args, config, initialize, mode="anchor")
23
+ self.source_anchor_classifier = self.init_anchor_classifier(dataset, args, config, initialize, mode="source_anchor")
24
+ self.target_anchor_classifier = self.init_anchor_classifier(dataset, args, config, initialize, mode="target_anchor")
25
+
26
+ self.query_length = args.query_length
27
+ self.focal = args.focal
28
+ self.dataset = dataset
29
+
30
+ def forward(self, encoder_output, decoder_output, encoder_mask, decoder_mask, batch):
31
+ output = {}
32
+
33
+ decoder_lens = self.query_length * batch["every_input"][1]
34
+ output["label"] = self.forward_label(decoder_output)
35
+ output["anchor"] = self.forward_anchor(decoder_output, encoder_output, encoder_mask, mode="anchor") # shape: (B, T_l, T_w)
36
+ output["source_anchor"] = self.forward_anchor(decoder_output, encoder_output, encoder_mask, mode="source_anchor") # shape: (B, T_l, T_w)
37
+ output["target_anchor"] = self.forward_anchor(decoder_output, encoder_output, encoder_mask, mode="target_anchor") # shape: (B, T_l, T_w)
38
+
39
+ cost_matrices = self.create_cost_matrices(output, batch, decoder_lens)
40
+ matching = get_matching(cost_matrices)
41
+ decoder_output = reorder(decoder_output, matching, batch["labels"][0].size(1))
42
+ output["edge presence"], output["edge label"] = self.forward_edge(decoder_output)
43
+
44
+ return self.loss(output, batch, matching, decoder_mask)
45
+
46
+ def predict(self, encoder_output, decoder_output, encoder_mask, decoder_mask, batch, **kwargs):
47
+ every_input, word_lens = batch["every_input"]
48
+ decoder_lens = self.query_length * word_lens
49
+ batch_size = every_input.size(0)
50
+
51
+ label_pred = self.forward_label(decoder_output)
52
+ anchor_pred = self.forward_anchor(decoder_output, encoder_output, encoder_mask, mode="anchor") # shape: (B, T_l, T_w)
53
+ source_anchor_pred = self.forward_anchor(decoder_output, encoder_output, encoder_mask, mode="source_anchor") # shape: (B, T_l, T_w)
54
+ target_anchor_pred = self.forward_anchor(decoder_output, encoder_output, encoder_mask, mode="target_anchor") # shape: (B, T_l, T_w)
55
+
56
+ labels = [[] for _ in range(batch_size)]
57
+ anchors, source_anchors, target_anchors = [[] for _ in range(batch_size)], [[] for _ in range(batch_size)], [[] for _ in range(batch_size)]
58
+
59
+ for b in range(batch_size):
60
+ label_indices = self.inference_label(label_pred[b, :decoder_lens[b], :]).cpu()
61
+ for t in range(label_indices.size(0)):
62
+ label_index = label_indices[t].item()
63
+ if label_index == 0:
64
+ continue
65
+
66
+ decoder_output[b, len(labels[b]), :] = decoder_output[b, t, :]
67
+
68
+ labels[b].append(label_index)
69
+ if anchor_pred is None:
70
+ anchors[b].append(list(range(t // self.query_length, word_lens[b])))
71
+ else:
72
+ anchors[b].append(self.inference_anchor(anchor_pred[b, t, :word_lens[b]]).cpu())
73
+
74
+ if source_anchor_pred is None:
75
+ source_anchors[b].append(list(range(t // self.query_length, word_lens[b])))
76
+ else:
77
+ source_anchors[b].append(self.inference_anchor(source_anchor_pred[b, t, :word_lens[b]]).cpu())
78
+
79
+ if target_anchor_pred is None:
80
+ target_anchors[b].append(list(range(t // self.query_length, word_lens[b])))
81
+ else:
82
+ target_anchors[b].append(self.inference_anchor(target_anchor_pred[b, t, :word_lens[b]]).cpu())
83
+
84
+ decoder_output = decoder_output[:, : max(len(l) for l in labels), :]
85
+ edge_presence, edge_labels = self.forward_edge(decoder_output)
86
+
87
+ outputs = [
88
+ self.parser.parse(
89
+ {
90
+ "labels": labels[b],
91
+ "anchors": anchors[b],
92
+ "source anchors": source_anchors[b],
93
+ "target anchors": target_anchors[b],
94
+ "edge presence": self.inference_edge_presence(edge_presence, b),
95
+ "edge labels": self.inference_edge_label(edge_labels, b),
96
+ "id": batch["id"][b].cpu(),
97
+ "tokens": batch["every_input"][0][b, : word_lens[b]].cpu(),
98
+ "token intervals": batch["token_intervals"][b, :, :].cpu(),
99
+ },
100
+ **kwargs
101
+ )
102
+ for b in range(batch_size)
103
+ ]
104
+
105
+ return outputs
106
+
107
+ def loss(self, output, batch, matching, decoder_mask):
108
+ batch_size = batch["every_input"][0].size(0)
109
+ device = batch["every_input"][0].device
110
+ T_label = batch["labels"][0].size(1)
111
+ T_input = batch["every_input"][0].size(1)
112
+ T_edge = batch["edge_presence"].size(1)
113
+
114
+ input_mask = create_padding_mask(batch_size, T_input, batch["every_input"][1], device) # shape: (B, T_input)
115
+ label_mask = create_padding_mask(batch_size, T_label, batch["labels"][1], device) # shape: (B, T_label)
116
+ edge_mask = torch.eye(T_label, T_label, device=device, dtype=torch.bool).unsqueeze(0) # shape: (1, T_label, T_label)
117
+ edge_mask = edge_mask | label_mask.unsqueeze(1) | label_mask.unsqueeze(2) # shape: (B, T_label, T_label)
118
+ if T_edge != T_label:
119
+ edge_mask = F.pad(edge_mask, (T_edge - T_label, 0, T_edge - T_label, 0), value=0)
120
+ edge_label_mask = (batch["edge_presence"] == 0) | edge_mask
121
+
122
+ if output["edge label"] is not None:
123
+ batch["edge_labels"] = (
124
+ batch["edge_labels"][0][:, :, :, :output["edge label"].size(-1)],
125
+ batch["edge_labels"][1],
126
+ )
127
+
128
+ losses = {}
129
+ losses.update(self.loss_label(output, batch, decoder_mask, matching))
130
+ losses.update(self.loss_anchor(output, batch, input_mask, matching, mode="anchor"))
131
+ losses.update(self.loss_anchor(output, batch, input_mask, matching, mode="source_anchor"))
132
+ losses.update(self.loss_anchor(output, batch, input_mask, matching, mode="target_anchor"))
133
+ losses.update(self.loss_edge_presence(output, batch, edge_mask))
134
+ losses.update(self.loss_edge_label(output, batch, edge_label_mask.unsqueeze(-1)))
135
+
136
+ stats = {f"{key}": value.detach().cpu().item() for key, value in losses.items()}
137
+ total_loss = sum(losses.values()) / len(losses)
138
+
139
+ return total_loss, stats
140
+
141
+ @torch.no_grad()
142
+ def create_cost_matrices(self, output, batch, decoder_lens):
143
+ batch_size = len(batch["labels"][1])
144
+ decoder_lens = decoder_lens.cpu()
145
+
146
+ matrices = []
147
+ for b in range(batch_size):
148
+ label_cost_matrix = self.label_cost_matrix(output, batch, decoder_lens, b)
149
+ anchor_cost_matrix = self.anchor_cost_matrix(output, batch, decoder_lens, b)
150
+
151
+ cost_matrix = label_cost_matrix * anchor_cost_matrix
152
+ matrices.append(cost_matrix.cpu())
153
+
154
+ return matrices
155
+
156
+ def init_edge_classifier(self, dataset, args, config, initialize: bool):
157
+ if not config["edge presence"] and not config["edge label"]:
158
+ return None
159
+ return EdgeClassifier(dataset, args, initialize, presence=config["edge presence"], label=config["edge label"])
160
+
161
+ def init_label_classifier(self, dataset, args, config, initialize: bool):
162
+ if not config["label"]:
163
+ return None
164
+
165
+ classifier = nn.Sequential(
166
+ nn.Dropout(args.dropout_label),
167
+ nn.Linear(args.hidden_size, len(dataset.label_field.vocab) + 1, bias=True)
168
+ )
169
+ if initialize:
170
+ classifier[1].bias.data = dataset.label_freqs.log()
171
+
172
+ return classifier
173
+
174
+ def init_anchor_classifier(self, dataset, args, config, initialize: bool, mode="anchor"):
175
+ if not config[mode]:
176
+ return None
177
+
178
+ return AnchorClassifier(dataset, args, initialize, mode=mode)
179
+
180
+ def forward_edge(self, decoder_output):
181
+ if self.edge_classifier is None:
182
+ return None, None
183
+ return self.edge_classifier(decoder_output)
184
+
185
+ def forward_label(self, decoder_output):
186
+ if self.label_classifier is None:
187
+ return None
188
+ return torch.log_softmax(self.label_classifier(decoder_output), dim=-1)
189
+
190
+ def forward_anchor(self, decoder_output, encoder_output, encoder_mask, mode="anchor"):
191
+ classifier = getattr(self, f"{mode}_classifier")
192
+ if classifier is None:
193
+ return None
194
+ return classifier(decoder_output, encoder_output, encoder_mask)
195
+
196
+ def inference_label(self, prediction):
197
+ prediction = prediction.exp()
198
+ return torch.where(
199
+ prediction[:, 0] > prediction[:, 1:].sum(-1),
200
+ torch.zeros(prediction.size(0), dtype=torch.long, device=prediction.device),
201
+ prediction[:, 1:].argmax(dim=-1) + 1
202
+ )
203
+
204
+ def inference_anchor(self, prediction):
205
+ return prediction.sigmoid()
206
+
207
+ def inference_edge_presence(self, prediction, example_index: int):
208
+ if prediction is None:
209
+ return None
210
+
211
+ N = prediction.size(1)
212
+ mask = torch.eye(N, N, device=prediction.device, dtype=torch.bool)
213
+ return prediction[example_index, :, :].sigmoid().masked_fill(mask, 0.0).cpu()
214
+
215
+ def inference_edge_label(self, prediction, example_index: int):
216
+ if prediction is None:
217
+ return None
218
+ return prediction[example_index, :, :, :].cpu()
219
+
220
+ def loss_edge_presence(self, prediction, target, mask):
221
+ if self.edge_classifier is None or prediction["edge presence"] is None:
222
+ return {}
223
+ return {"edge presence": binary_cross_entropy(prediction["edge presence"], target["edge_presence"].float(), mask)}
224
+
225
+ def loss_edge_label(self, prediction, target, mask):
226
+ if self.edge_classifier is None or prediction["edge label"] is None:
227
+ return {}
228
+ return {"edge label": binary_cross_entropy(prediction["edge label"], target["edge_labels"][0].float(), mask)}
229
+
230
+ def loss_label(self, prediction, target, mask, matching):
231
+ if self.label_classifier is None or prediction["label"] is None:
232
+ return {}
233
+
234
+ prediction = prediction["label"]
235
+ target = match_label(
236
+ target["labels"][0], matching, prediction.shape[:-1], prediction.device, self.query_length
237
+ )
238
+ return {"label": cross_entropy(prediction, target, mask, focal=self.focal)}
239
+
240
+ def loss_anchor(self, prediction, target, mask, matching, mode="anchor"):
241
+ if getattr(self, f"{mode}_classifier") is None or prediction[mode] is None:
242
+ return {}
243
+
244
+ prediction = prediction[mode]
245
+ target, anchor_mask = match_anchor(target[mode], matching, prediction.shape, prediction.device)
246
+ mask = anchor_mask.unsqueeze(-1) | mask.unsqueeze(-2)
247
+ return {mode: binary_cross_entropy(prediction, target.float(), mask)}
248
+
249
+ def label_cost_matrix(self, output, batch, decoder_lens, b: int):
250
+ if output["label"] is None:
251
+ return 1.0
252
+
253
+ target_labels = batch["anchored_labels"][b] # shape: (num_nodes, num_inputs, num_classes)
254
+ label_prob = output["label"][b, : decoder_lens[b], :].exp().unsqueeze(0) # shape: (1, num_queries, num_classes)
255
+ tgt_label = target_labels.repeat_interleave(self.query_length, dim=1) # shape: (num_nodes, num_queries, num_classes)
256
+ cost_matrix = ((tgt_label * label_prob).sum(-1) * label_prob[:, :, 1:].sum(-1)).t().sqrt() # shape: (num_queries, num_nodes)
257
+
258
+ return cost_matrix
259
+
260
+ def anchor_cost_matrix(self, output, batch, decoder_lens, b: int):
261
+ if output["anchor"] is None:
262
+ return 1.0
263
+
264
+ num_nodes = batch["labels"][1][b]
265
+ word_lens = batch["every_input"][1]
266
+ target_anchors, _ = batch["anchor"]
267
+ pred_anchors = output["anchor"].sigmoid()
268
+
269
+ tgt_align = target_anchors[b, : num_nodes, : word_lens[b]] # shape: (num_nodes, num_inputs)
270
+ align_prob = pred_anchors[b, : decoder_lens[b], : word_lens[b]] # shape: (num_queries, num_inputs)
271
+ align_prob = align_prob.unsqueeze(1).expand(-1, num_nodes, -1) # shape: (num_queries, num_nodes, num_inputs)
272
+ align_prob = torch.where(tgt_align.unsqueeze(0).bool(), align_prob, 1.0 - align_prob) # shape: (num_queries, num_nodes, num_inputs)
273
+ cost_matrix = align_prob.log().mean(-1).exp() # shape: (num_queries, num_nodes)
274
+ return cost_matrix
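
A note on `inference_label` above: a query is kept as a node only when the combined probability of all non-null labels outweighs the null class, which is not the same as a plain argmax over all classes. A minimal self-contained sketch with made-up probabilities (the `+1` offsets the prediction into the label vocabulary, index 0 being "no node"):

```python
import torch

# Toy probabilities for 4 queries over [null, A, B] classes.
log_probs = torch.tensor([
    [0.60, 0.25, 0.15],   # null wins outright              -> 0 (no node)
    [0.40, 0.35, 0.25],   # null < A + B = 0.60             -> keep, label A
    [0.55, 0.30, 0.15],   # null > 0.45                     -> 0 (no node)
    [0.20, 0.30, 0.50],   # clearly a node                  -> label B
]).log()

prediction = log_probs.exp()
labels = torch.where(
    prediction[:, 0] > prediction[:, 1:].sum(-1),
    torch.zeros(prediction.size(0), dtype=torch.long),
    prediction[:, 1:].argmax(dim=-1) + 1,
)
print(labels)  # tensor([0, 1, 0, 2])
```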
model/head/labeled_edge_head.py ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from model.head.abstract_head import AbstractHead
8
+ from data.parser.to_mrp.labeled_edge_parser import LabeledEdgeParser
9
+ from utility.cross_entropy import binary_cross_entropy
10
+ from utility.hungarian_matching import match_label
11
+
12
+
13
+ class LabeledEdgeHead(AbstractHead):
14
+ def __init__(self, dataset, args, initialize):
15
+ config = {
16
+ "label": True,
17
+ "edge presence": True,
18
+ "edge label": True,
19
+ "anchor": True,
20
+ "source_anchor": False,
21
+ "target_anchor": False
22
+ }
23
+ super(LabeledEdgeHead, self).__init__(dataset, args, config, initialize)
24
+
25
+ self.top_node = nn.Parameter(torch.randn(1, 1, args.hidden_size), requires_grad=True)
26
+ self.parser = LabeledEdgeParser(dataset)
27
+
28
+ def init_label_classifier(self, dataset, args, config, initialize: bool):
29
+ classifier = nn.Sequential(
30
+ nn.Dropout(args.dropout_label),
31
+ nn.Linear(args.hidden_size, 1, bias=True)
32
+ )
33
+ if initialize:
34
+ bias_init = torch.tensor([dataset.label_freqs[1]])
35
+ classifier[1].bias.data = (bias_init / (1.0 - bias_init)).log()
36
+
37
+ return classifier
38
+
39
+ def forward_label(self, decoder_output):
40
+ return self.label_classifier(decoder_output)
41
+
42
+ def forward_edge(self, decoder_output):
43
+ top_node = self.top_node.expand(decoder_output.size(0), -1, -1)
44
+ decoder_output = torch.cat([top_node, decoder_output], dim=1)
45
+ return self.edge_classifier(decoder_output)
46
+
47
+ def loss_label(self, prediction, target, mask, matching):
48
+ prediction = prediction["label"]
49
+ target = match_label(
50
+ target["labels"][0], matching, prediction.shape[:-1], prediction.device, self.query_length
51
+ )
52
+ return {"label": binary_cross_entropy(prediction.squeeze(-1), target.float(), mask, focal=self.focal)}
53
+
54
+ def inference_label(self, prediction):
55
+ return (prediction.squeeze(-1) > 0.0).long()
56
+
57
+ def label_cost_matrix(self, output, batch, decoder_lens, b: int):
58
+ if output["label"] is None:
59
+ return 1.0
60
+
61
+ target_labels = batch["anchored_labels"][b] # shape: (num_nodes, num_inputs, 2)
62
+ label_prob = output["label"][b, : decoder_lens[b], :].sigmoid().unsqueeze(0) # shape: (1, num_queries, 1)
63
+ label_prob = torch.cat([1.0 - label_prob, label_prob], dim=-1) # shape: (1, num_queries, 2)
64
+ tgt_label = target_labels.repeat_interleave(self.query_length, dim=1) # shape: (num_nodes, num_queries, 2)
65
+ cost_matrix = ((tgt_label * label_prob).sum(-1) * label_prob[:, :, 1:].sum(-1)).t().sqrt() # shape: (num_queries, num_nodes)
66
+
67
+ return cost_matrix
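
The bias initialization used here (and in the anchor and edge classifiers elsewhere in this commit) is the usual log-odds trick: with an empirical positive-class frequency p, setting the output bias to log(p / (1 - p)) makes the untrained sigmoid output equal to p. A quick sanity check; the 0.23 frequency is an arbitrary placeholder, not a NoReC statistic:

```python
import torch

p = torch.tensor([0.23])        # assumed empirical frequency of the positive class
bias = (p / (1.0 - p)).log()    # log-odds, as used for the classifier bias above
print(torch.sigmoid(bias))      # tensor([0.2300]) -- the prior is recovered before training
```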
model/head/node_centric_head.py ADDED
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+
6
+ from model.head.abstract_head import AbstractHead
7
+ from data.parser.to_mrp.node_centric_parser import NodeCentricParser
8
+ from utility.cross_entropy import binary_cross_entropy
9
+
10
+
11
+ class NodeCentricHead(AbstractHead):
12
+ def __init__(self, dataset, args, initialize):
13
+ config = {
14
+ "label": True,
15
+ "edge presence": True,
16
+ "edge label": False,
17
+ "anchor": True,
18
+ "source_anchor": False,
19
+ "target_anchor": False
20
+ }
21
+ super(NodeCentricHead, self).__init__(dataset, args, config, initialize)
22
+
23
+ self.source_id = dataset.label_field.vocab.stoi["Source"] + 1
24
+ self.target_id = dataset.label_field.vocab.stoi["Target"] + 1
25
+ self.parser = NodeCentricParser(dataset)
model/head/sequential_head.py ADDED
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from model.head.abstract_head import AbstractHead
9
+ from data.parser.to_mrp.sequential_parser import SequentialParser
10
+ from utility.cross_entropy import cross_entropy
11
+
12
+
13
+ class SequentialHead(AbstractHead):
14
+ def __init__(self, dataset, args, initialize):
15
+ config = {
16
+ "label": True,
17
+ "edge presence": False,
18
+ "edge label": False,
19
+ "anchor": True,
20
+ "source_anchor": True,
21
+ "target_anchor": True
22
+ }
23
+ super(SequentialHead, self).__init__(dataset, args, config, initialize)
24
+ self.parser = SequentialParser(dataset)
model/model.py ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from model.module.encoder import Encoder
8
+
9
+ from model.module.transformer import Decoder
10
+ from model.head.node_centric_head import NodeCentricHead
11
+ from model.head.labeled_edge_head import LabeledEdgeHead
12
+ from model.head.sequential_head import SequentialHead
13
+ from utility.utils import create_padding_mask
14
+
15
+
16
+ class Model(nn.Module):
17
+ def __init__(self, dataset, args, initialize=True):
18
+ super(Model, self).__init__()
19
+ self.encoder = Encoder(args, dataset)
20
+ if args.n_layers > 0:
21
+ self.decoder = Decoder(args)
22
+ else:
23
+ self.decoder = lambda x, *args: x # identity function, which ignores all arguments except the first one
24
+
25
+ if args.graph_mode == "sequential":
26
+ self.head = SequentialHead(dataset, args, initialize)
27
+ elif args.graph_mode == "node-centric":
28
+ self.head = NodeCentricHead(dataset, args, initialize)
29
+ elif args.graph_mode == "labeled-edge":
30
+ self.head = LabeledEdgeHead(dataset, args, initialize)
31
+
32
+ self.query_length = args.query_length
33
+ self.dataset = dataset
34
+ self.args = args
35
+
36
+ def forward(self, batch, inference=False, **kwargs):
37
+ every_input, word_lens = batch["every_input"]
38
+ decoder_lens = self.query_length * word_lens
39
+ batch_size, input_len = every_input.size(0), every_input.size(1)
40
+ device = every_input.device
41
+
42
+ encoder_mask = create_padding_mask(batch_size, input_len, word_lens, device)
43
+ decoder_mask = create_padding_mask(batch_size, self.query_length * input_len, decoder_lens, device)
44
+
45
+ encoder_output, decoder_input = self.encoder(batch["input"], batch["char_form_input"], batch["input_scatter"], input_len)
46
+
47
+ decoder_output = self.decoder(decoder_input, encoder_output, decoder_mask, encoder_mask)
48
+
49
+ if inference:
50
+ return self.head.predict(encoder_output, decoder_output, encoder_mask, decoder_mask, batch)
51
+ else:
52
+ return self.head(encoder_output, decoder_output, encoder_mask, decoder_mask, batch)
53
+
54
+ def get_params_for_optimizer(self, args):
55
+ encoder_decay, encoder_no_decay = self.get_encoder_parameters(args.n_encoder_layers)
56
+ decoder_decay, decoder_no_decay = self.get_decoder_parameters()
57
+
58
+ parameters = [{"params": p, "weight_decay": args.encoder_weight_decay} for p in encoder_decay]
59
+ parameters += [{"params": p, "weight_decay": 0.0} for p in encoder_no_decay]
60
+ parameters += [
61
+ {"params": decoder_decay, "weight_decay": args.decoder_weight_decay},
62
+ {"params": decoder_no_decay, "weight_decay": 0.0},
63
+ ]
64
+ return parameters
65
+
66
+ def get_decoder_parameters(self):
67
+ no_decay = ["bias", "LayerNorm.weight", "_norm.weight"]
68
+ decay_params = (p for name, p in self.named_parameters() if not any(nd in name for nd in no_decay) and not name.startswith("encoder.bert") and p.requires_grad)
69
+ no_decay_params = (p for name, p in self.named_parameters() if any(nd in name for nd in no_decay) and not name.startswith("encoder.bert") and p.requires_grad)
70
+
71
+ return decay_params, no_decay_params
72
+
73
+ def get_encoder_parameters(self, n_layers):
74
+ no_decay = ["bias", "LayerNorm.weight", "_norm.weight"]
75
+ decay_params = [
76
+ [p for name, p in self.named_parameters() if not any(nd in name for nd in no_decay) and name.startswith(f"encoder.bert.encoder.layer.{n_layers - 1 - i}.") and p.requires_grad] for i in range(n_layers)
77
+ ]
78
+ no_decay_params = [
79
+ [p for name, p in self.named_parameters() if any(nd in name for nd in no_decay) and name.startswith(f"encoder.bert.encoder.layer.{n_layers - 1 - i}.") and p.requires_grad] for i in range(n_layers)
80
+ ]
81
+
82
+ return decay_params, no_decay_params
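
The grouping above separates decay from no-decay parameters and keeps each BERT layer in its own group, which is what allows layer-wise handling in the optimizer. The sketch below only illustrates that param-group convention with a toy module; the module, the learning rates, and the AdamW call are assumptions for illustration, not the training setup of this repo:

```python
import torch
import torch.nn as nn


class Toy(nn.Module):
    # Minimal module whose parameter names mimic the "bias" / "LayerNorm.weight" patterns.
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)
        self.out = nn.Linear(8, 2)


toy = Toy()
no_decay = ["bias", "LayerNorm.weight"]

param_groups = [
    {"params": [p for n, p in toy.named_parameters() if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01, "lr": 1e-5},
    {"params": [p for n, p in toy.named_parameters() if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0, "lr": 5e-5},
]
optimizer = torch.optim.AdamW(param_groups)
print([len(g["params"]) for g in param_groups])  # [2, 4]
```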
model/module/__init__.py ADDED
File without changes
model/module/anchor_classifier.py ADDED
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from model.module.biaffine import Biaffine
8
+
9
+
10
+ class AnchorClassifier(nn.Module):
11
+ def __init__(self, dataset, args, initialize: bool, bias=True, mode="anchor"):
12
+ super(AnchorClassifier, self).__init__()
13
+
14
+ self.token_f = nn.Linear(args.hidden_size, args.hidden_size_anchor)
15
+ self.label_f = nn.Linear(args.hidden_size, args.hidden_size_anchor)
16
+ self.dropout = nn.Dropout(args.dropout_anchor)
17
+
18
+ if bias and initialize:
19
+ bias_init = torch.tensor([getattr(dataset, f"{mode}_freq")])
20
+ bias_init = (bias_init / (1.0 - bias_init)).log()
21
+ else:
22
+ bias_init = None
23
+
24
+ self.output = Biaffine(args.hidden_size_anchor, 1, bias=bias, bias_init=bias_init)
25
+
26
+ def forward(self, label, tokens, encoder_mask):
27
+ tokens = self.dropout(F.elu(self.token_f(tokens))) # shape: (B, T_w, H)
28
+ label = self.dropout(F.elu(self.label_f(label))) # shape: (B, T_l, H)
29
+ anchor = self.output(label, tokens).squeeze(-1) # shape: (B, T_l, T_w)
30
+
31
+ anchor = anchor.masked_fill(encoder_mask.unsqueeze(1), float("-inf")) # shape: (B, T_l, T_w)
32
+ return anchor
model/module/biaffine.py ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch.nn as nn
5
+ from model.module.bilinear import Bilinear
6
+
7
+
8
+ class Biaffine(nn.Module):
9
+ def __init__(self, input_dim, output_dim, bias=True, bias_init=None):
10
+ super(Biaffine, self).__init__()
11
+
12
+ self.linear_1 = nn.Linear(input_dim, output_dim, bias=False)
13
+ self.linear_2 = nn.Linear(input_dim, output_dim, bias=False)
14
+
15
+ self.bilinear = Bilinear(input_dim, input_dim, output_dim, bias=bias)
16
+ if bias_init is not None:
17
+ self.bilinear.bias.data = bias_init
18
+
19
+ def forward(self, x, y):
20
+ return self.bilinear(x, y) + self.linear_1(x).unsqueeze(2) + self.linear_2(y).unsqueeze(1)
model/module/bilinear.py ADDED
@@ -0,0 +1,43 @@
1
+ # from https://github.com/NLPInBLCU/BiaffineDependencyParsing/blob/master/modules/biaffine.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class Bilinear(nn.Module):
8
+ """
9
+ Version in use.
10
+ A bilinear module that deals with broadcasting for efficient memory usage.
11
+ Input: tensors of sizes (N x L1 x D1) and (N x L2 x D2)
12
+ Output: tensor of size (N x L1 x L2 x O)"""
13
+
14
+ def __init__(self, input1_size, input2_size, output_size, bias=True):
15
+ super(Bilinear, self).__init__()
16
+
17
+ self.input1_size = input1_size
18
+ self.input2_size = input2_size
19
+ self.output_size = output_size
20
+
21
+ self.weight = nn.Parameter(torch.Tensor(input1_size, input2_size, output_size))
22
+ self.bias = nn.Parameter(torch.Tensor(output_size)) if bias else None
23
+
24
+ self.reset_parameters()
25
+
26
+ def reset_parameters(self):
27
+ nn.init.zeros_(self.weight)
28
+
29
+ def forward(self, input1, input2):
30
+ input1_size = list(input1.size())
31
+ input2_size = list(input2.size())
32
+
33
+ intermediate = torch.mm(input1.view(-1, input1_size[-1]), self.weight.view(-1, self.input2_size * self.output_size),)
34
+
35
+ input2 = input2.transpose(1, 2)
36
+ output = intermediate.view(input1_size[0], input1_size[1] * self.output_size, input2_size[2]).bmm(input2)
37
+
38
+ output = output.view(input1_size[0], input1_size[1], self.output_size, input2_size[1]).transpose(2, 3)
39
+
40
+ if self.bias is not None:
41
+ output = output + self.bias
42
+
43
+ return output
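
Because the broadcasting in `Bilinear.forward` is easy to misread, here is a quick shape check of the two modules defined above. It assumes the repository root is on `PYTHONPATH` so that the modules are importable; the dimensions are arbitrary:

```python
import torch
from model.module.biaffine import Biaffine
from model.module.bilinear import Bilinear

B, L1, L2, D, O = 2, 5, 7, 16, 3
x = torch.randn(B, L1, D)   # e.g. label queries
y = torch.randn(B, L2, D)   # e.g. token states

bilinear = Bilinear(D, D, O)
biaffine = Biaffine(D, O)

# Both produce one O-dimensional score per (query, token) pair.
print(bilinear(x, y).shape)  # torch.Size([2, 5, 7, 3])
print(biaffine(x, y).shape)  # torch.Size([2, 5, 7, 3])
```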
model/module/char_embedding.py ADDED
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence
7
+
8
+
9
+ class CharEmbedding(nn.Module):
10
+ def __init__(self, vocab_size: int, embedding_size: int, output_size: int):
11
+ super(CharEmbedding, self).__init__()
12
+
13
+ self.embedding = nn.Embedding(vocab_size, embedding_size, sparse=False)
14
+ self.layer_norm = nn.LayerNorm(embedding_size)
15
+ self.gru = nn.GRU(embedding_size, embedding_size, num_layers=1, bidirectional=True)
16
+ self.out_linear = nn.Linear(2*embedding_size, output_size)
17
+ self.layer_norm_2 = nn.LayerNorm(output_size)
18
+
19
+ def forward(self, words, sentence_lens, word_lens):
20
+ # input shape: (B, W, C)
21
+ n_words = words.size(1)
22
+ sentence_lens = sentence_lens.cpu()
23
+ sentence_packed = pack_padded_sequence(words, sentence_lens, batch_first=True) # shape: (B*W, C)
24
+ lens_packed = pack_padded_sequence(word_lens, sentence_lens, batch_first=True) # shape: (B*W)
25
+ word_packed = pack_padded_sequence(sentence_packed.data, lens_packed.data.cpu(), batch_first=True, enforce_sorted=False) # shape: (B*W*C)
26
+
27
+ embedded = self.embedding(word_packed.data) # shape: (B*W*C, D)
28
+ embedded = self.layer_norm(embedded) # shape: (B*W*C, D)
29
+
30
+ embedded_packed = PackedSequence(embedded, word_packed[1], word_packed[2], word_packed[3])
31
+ _, embedded = self.gru(embedded_packed) # shape: (layers * 2, B*W, D)
32
+
33
+ embedded = embedded[-2:, :, :].transpose(0, 1).flatten(1, 2) # shape: (B*W, 2*D)
34
+ embedded = F.relu(embedded)
35
+ embedded = self.out_linear(embedded)
36
+ embedded = self.layer_norm_2(embedded)
37
+
38
+ embedded, _ = pad_packed_sequence(
39
+ PackedSequence(embedded, sentence_packed[1], sentence_packed[2], sentence_packed[3]), batch_first=True, total_length=n_words,
40
+ ) # shape: (B, W, output_size)
41
+
42
+ return embedded # shape: (B, W, output_size)
model/module/edge_classifier.py ADDED
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from model.module.biaffine import Biaffine
8
+
9
+
10
+ class EdgeClassifier(nn.Module):
11
+ def __init__(self, dataset, args, initialize: bool, presence: bool, label: bool):
12
+ super(EdgeClassifier, self).__init__()
13
+
14
+ self.presence = presence
15
+ if self.presence:
16
+ if initialize:
17
+ presence_init = torch.tensor([dataset.edge_presence_freq])
18
+ presence_init = (presence_init / (1.0 - presence_init)).log()
19
+ else:
20
+ presence_init = None
21
+
22
+ self.edge_presence = EdgeBiaffine(
23
+ args.hidden_size, args.hidden_size_edge_presence, 1, args.dropout_edge_presence, bias_init=presence_init
24
+ )
25
+
26
+ self.label = label
27
+ if self.label:
28
+ label_init = (dataset.edge_label_freqs / (1.0 - dataset.edge_label_freqs)).log() if initialize else None
29
+ n_labels = len(dataset.edge_label_field.vocab)
30
+ self.edge_label = EdgeBiaffine(
31
+ args.hidden_size, args.hidden_size_edge_label, n_labels, args.dropout_edge_label, bias_init=label_init
32
+ )
33
+
34
+ def forward(self, x):
35
+ presence, label = None, None
36
+
37
+ if self.presence:
38
+ presence = self.edge_presence(x).squeeze(-1) # shape: (B, T, T)
39
+ if self.label:
40
+ label = self.edge_label(x) # shape: (B, T, T, O_1)
41
+
42
+ return presence, label
43
+
44
+
45
+ class EdgeBiaffine(nn.Module):
46
+ def __init__(self, hidden_dim, bottleneck_dim, output_dim, dropout, bias_init=None):
47
+ super(EdgeBiaffine, self).__init__()
48
+ self.hidden = nn.Linear(hidden_dim, 2 * bottleneck_dim)
49
+ self.output = Biaffine(bottleneck_dim, output_dim, bias_init=bias_init)
50
+ self.dropout = nn.Dropout(dropout)
51
+
52
+ def forward(self, x):
53
+ x = self.dropout(F.elu(self.hidden(x))) # shape: (B, T, 2H)
54
+ predecessors, current = x.chunk(2, dim=-1) # shape: (B, T, H), (B, T, H)
55
+ edge = self.output(current, predecessors) # shape: (B, T, T, O)
56
+ return edge
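
The edge classifier above emits a dense (B, T, T) score matrix; at inference time (`inference_edge_presence` in `abstract_head.py`) the diagonal is masked so that no node attaches to itself. A tiny standalone illustration with random scores:

```python
import torch

B, T = 1, 4
scores = torch.randn(B, T, T)                   # raw edge-presence logits
eye = torch.eye(T, dtype=torch.bool)            # self-loop positions
probs = scores.sigmoid().masked_fill(eye, 0.0)  # probabilities, diagonal zeroed
print(probs[0])
```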
model/module/encoder.py ADDED
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from transformers import AutoModel
11
+ from model.module.char_embedding import CharEmbedding
12
+
13
+
14
+ class WordDropout(nn.Dropout):
15
+ def forward(self, input_tensor):
16
+ if self.p == 0:
17
+ return input_tensor
18
+
19
+ ones = input_tensor.new_ones(input_tensor.shape[:-1])
20
+ dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False)
21
+
22
+ return dropout_mask.unsqueeze(-1) * input_tensor
23
+
24
+
25
+ class Encoder(nn.Module):
26
+ def __init__(self, args, dataset):
27
+ super(Encoder, self).__init__()
28
+
29
+ self.dim = args.hidden_size
30
+ self.n_layers = args.n_encoder_layers
31
+ self.width_factor = args.query_length
32
+
33
+ self.bert = AutoModel.from_pretrained(args.encoder, add_pooling_layer=False)
34
+ # self.bert._set_gradient_checkpointing(self.bert.encoder, value=True)
35
+ if args.encoder_freeze_embedding:
36
+ self.bert.embeddings.requires_grad_(False)
37
+ self.bert.embeddings.LayerNorm.requires_grad_(True)
38
+
39
+ if args.freeze_bert:
40
+ self.bert.requires_grad_(False)
41
+
42
+ self.use_char_embedding = args.char_embedding
43
+ if self.use_char_embedding:
44
+ self.form_char_embedding = CharEmbedding(dataset.char_form_vocab_size, args.char_embedding_size, self.dim)
45
+ self.word_dropout = WordDropout(args.dropout_word)
46
+
47
+ self.post_layer_norm = nn.LayerNorm(self.dim)
48
+ self.subword_attention = nn.Linear(self.dim, 1)
49
+
50
+ if self.width_factor > 1:
51
+ self.query_generator = nn.Linear(self.dim, self.dim * self.width_factor)
52
+ else:
53
+ self.query_generator = nn.Identity()
54
+
55
+ self.encoded_layer_norm = nn.LayerNorm(self.dim)
56
+ self.scores = nn.Parameter(torch.zeros(self.n_layers, 1, 1, 1), requires_grad=True)
57
+
58
+ def forward(self, bert_input, form_chars, to_scatter, n_words):
59
+ tokens, mask = bert_input
60
+ batch_size = tokens.size(0)
61
+
62
+ encoded = self.bert(tokens, attention_mask=mask, output_hidden_states=True).hidden_states[1:]
63
+ encoded = torch.stack(encoded, dim=0) # shape: (n_bert_layers, B, T, H)
64
+ encoded = self.encoded_layer_norm(encoded)
65
+
66
+ if self.training:
67
+ time_len = encoded.size(2)
68
+ scores = self.scores.expand(-1, batch_size, time_len, -1)
69
+ dropout = torch.empty(self.n_layers, batch_size, 1, 1, dtype=torch.bool, device=self.scores.device)
70
+ dropout.bernoulli_(0.1)
71
+ scores = scores.masked_fill(dropout, float("-inf"))
72
+ else:
73
+ scores = self.scores
74
+
75
+ scores = F.softmax(scores, dim=0)
76
+ encoded = (scores * encoded).sum(0) # shape: (B, T, H)
77
+ encoded = encoded.masked_fill(mask.unsqueeze(-1) == 0, 0.0) # shape: (B, T, H)
78
+
79
+ subword_attention = self.subword_attention(encoded) / math.sqrt(self.dim) # shape: (B, T, 1)
80
+ subword_attention = subword_attention.expand_as(to_scatter) # shape: (B, T_subword, T_word)
81
+ subword_attention = subword_attention.masked_fill(to_scatter == 0, float("-inf")) # shape: (B, T_subword, T_word)
82
+ subword_attention = torch.softmax(subword_attention, dim=1) # shape: (B, T_subword, T_word)
83
+ subword_attention = subword_attention.masked_fill(to_scatter.sum(1, keepdim=True) == 0, value=0.0) # shape: (B, T_subword, T_word)
84
+
85
+ encoder_output = torch.einsum("bsd,bsw->bwd", encoded, subword_attention)
86
+ encoder_output = self.post_layer_norm(encoder_output)
87
+
88
+ if self.use_char_embedding:
89
+ form_char_embedding = self.form_char_embedding(form_chars[0], form_chars[1], form_chars[2])
90
+ encoder_output = self.word_dropout(encoder_output) + form_char_embedding
91
+
92
+ decoder_input = self.query_generator(encoder_output)
93
+ decoder_input = decoder_input.view(batch_size, -1, self.width_factor, self.dim).flatten(1, 2) # shape: (B, T*Q, D)
94
+
95
+ return encoder_output, decoder_input
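
The encoder mixes all BERT layers with a learned softmax weighting over per-layer scalars, and during training it randomly knocks whole layers out of the mixture by setting their score to -inf before the softmax. A standalone sketch of just that mixing step, with made-up sizes:

```python
import torch
import torch.nn.functional as F

n_layers, B, T, H = 4, 2, 6, 8
hidden_states = torch.randn(n_layers, B, T, H)   # stacked encoder layers
scores = torch.zeros(n_layers, 1, 1, 1)          # learned per-layer scores

# Training-time layer dropout: knock out each layer per example with p = 0.1.
drop = torch.empty(n_layers, B, 1, 1, dtype=torch.bool).bernoulli_(0.1)
masked_scores = scores.expand(-1, B, T, -1).masked_fill(drop, float("-inf"))

weights = F.softmax(masked_scores, dim=0)        # mixture weights over layers
mixed = (weights * hidden_states).sum(0)         # shape: (B, T, H)
print(mixed.shape)                               # torch.Size([2, 6, 8])
```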