import copy
import json
import logging
import os

import torch
from torch.utils.data import TensorDataset

from utils import get_intent_labels, get_slot_labels

logger = logging.getLogger(__name__)


class InputExample(object):
    """
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        intent_label: (Optional) string. The intent label of the example.
        slot_labels: (Optional) list. The slot labels of the example.
    """

    def __init__(self, guid, words, intent_label=None, slot_labels=None):
        self.guid = guid
        self.words = words
        self.intent_label = intent_label
        self.slot_labels = slot_labels

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.intent_label_id = intent_label_id
        self.slot_labels_ids = slot_labels_ids

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class JointProcessor(object):
    """Processor for the JointBERT data set."""

    def __init__(self, args):
        self.args = args
        self.intent_labels = get_intent_labels(args)
        self.slot_labels = get_slot_labels(args)

        self.input_text_file = "seq.in"
        self.intent_label_file = "label"
        self.slot_labels_file = "seq.out"

    @classmethod
    def _read_file(cls, input_file, quotechar=None):
        """Reads a text file line by line, stripping surrounding whitespace."""
        with open(input_file, "r", encoding="utf-8") as f:
            lines = []
            for line in f:
                lines.append(line.strip())
            return lines

    def _create_examples(self, texts, intents, slots, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for i, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
            guid = "%s-%s" % (set_type, i)
            # 1. input_text
            words = text.split()  # split() also collapses the occasional double space in the raw text
            # 2. intent
            intent_label = (
                self.intent_labels.index(intent) if intent in self.intent_labels else self.intent_labels.index("UNK")
            )
            # 3. slot
            slot_labels = []
            for s in slot.split():
                slot_labels.append(
                    self.slot_labels.index(s) if s in self.slot_labels else self.slot_labels.index("UNK")
                )

            assert len(words) == len(slot_labels), "Each word needs exactly one slot tag"
            examples.append(InputExample(guid=guid, words=words, intent_label=intent_label, slot_labels=slot_labels))
        return examples

    def get_examples(self, mode):
        """
        Args:
            mode: train, dev, test
        """
        data_path = os.path.join(self.args.data_dir, self.args.token_level, mode)
        logger.info("LOOKING AT {}".format(data_path))
        return self._create_examples(
            texts=self._read_file(os.path.join(data_path, self.input_text_file)),
            intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
            slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
            set_type=mode,
        )


processors = {"syllable-level": JointProcessor, "word-level": JointProcessor}
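
# Illustrative on-disk layout consumed by JointProcessor above. The file names
# (seq.in, label, seq.out) and the <data_dir>/<token_level>/<mode> path come
# straight from the class; the sample utterance, intent, and slot tags below are
# made-up placeholders, not taken from any real dataset.
#
#   <data_dir>/<token_level>/train/seq.in   : one whitespace-tokenized utterance per line
#   <data_dir>/<token_level>/train/label    : one intent label per line
#   <data_dir>/<token_level>/train/seq.out  : one space-separated slot tag per word, per line
#
#   seq.in  : book a flight to hanoi
#   label   : book_flight
#   seq.out : O O O O B-destination
#
# Each seq.out line must carry exactly as many tags as the matching seq.in line
# has words (enforced by the assert in _create_examples); unknown intents and
# slot tags fall back to the "UNK" label.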


def convert_examples_to_features(
    examples,
    max_seq_len,
    tokenizer,
    pad_token_label_id=-100,
    cls_token_segment_id=0,
    pad_token_segment_id=0,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
):
    # Special tokens and the pad id come from the tokenizer of the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        # Tokenize word by word (for NER)
        tokens = []
        slot_labels_ids = []
        for word, slot_label in zip(example.words, example.slot_labels):
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [unk_token]  # Fall back to the unknown token for words the tokenizer cannot encode
            tokens.extend(word_tokens)
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP]
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[: (max_seq_len - special_tokens_count)]
            slot_labels_ids = slot_labels_ids[: (max_seq_len - special_tokens_count)]

        # Add [SEP] token
        tokens += [sep_token]
        slot_labels_ids += [pad_token_label_id]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token
        tokens = [cls_token] + tokens
        slot_labels_ids = [pad_token_label_id] + slot_labels_ids
        token_type_ids = [cls_token_segment_id] + token_type_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(
            len(token_type_ids), max_seq_len
        )
        assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(
            len(slot_labels_ids), max_seq_len
        )

        intent_label_id = int(example.intent_label)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("intent_label: %s (id = %d)" % (example.intent_label, intent_label_id))
            logger.info("slot_labels: %s" % " ".join([str(x) for x in slot_labels_ids]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                intent_label_id=intent_label_id,
                slot_labels_ids=slot_labels_ids,
            )
        )

    return features
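
# Worked illustration of the sub-token alignment performed above. The label ids
# (1, 0, 2), the split of "westbam" into ["west", "##bam"], and max_seq_len = 8
# are hypothetical values chosen for the example, not output from a real run.
#
#   words           : listen  to  westbam
#   slot label ids  : 1       0   2
#
#   tokens          : [CLS]  listen  to  west  ##bam  [SEP]  [PAD]  [PAD]
#   slot_labels_ids : -100   1       0   2     -100   -100   -100   -100
#   attention_mask  : 1      1       1   1     1      1      0      0
#   token_type_ids  : 0      0       0   0     0      0      0      0
#
# Only the first sub-token of each word keeps its real slot label; continuation
# pieces, the special tokens, and the padding all receive pad_token_label_id so
# the slot loss ignores them.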


def load_and_cache_examples(args, tokenizer, mode):
    processor = processors[args.token_level](args)

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            mode, args.token_level, list(filter(None, args.model_name_or_path.split("/"))).pop(), args.max_seq_len
        ),
    )

    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        # Load data features from dataset file
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if mode == "train":
            examples = processor.get_examples("train")
        elif mode == "dev":
            examples = processor.get_examples("dev")
        elif mode == "test":
            examples = processor.get_examples("test")
        else:
            raise ValueError("Only train, dev, and test modes are available")

        # Use the cross entropy ignore index as the padding label id so that only real label ids contribute to the loss
        pad_token_label_id = args.ignore_index
        features = convert_examples_to_features(
            examples, args.max_seq_len, tokenizer, pad_token_label_id=pad_token_label_id
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_intent_label_ids = torch.tensor([f.intent_label_id for f in features], dtype=torch.long)
    all_slot_labels_ids = torch.tensor([f.slot_labels_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(
        all_input_ids, all_attention_mask, all_token_type_ids, all_intent_label_ids, all_slot_labels_ids
    )
    return dataset
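

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module). It shows one way to wire the
# loader into a training script, assuming a Hugging Face tokenizer checkpoint
# ("bert-base-uncased") and a ./data directory laid out as described above.
# Note: get_intent_labels / get_slot_labels in utils may expect additional
# attributes on `args` (e.g. label vocabulary file names) beyond those listed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    from torch.utils.data import DataLoader, RandomSampler
    from transformers import AutoTokenizer

    logging.basicConfig(level=logging.INFO)

    args = argparse.Namespace(
        data_dir="./data",  # assumed layout: <data_dir>/<token_level>/<mode>/{seq.in,label,seq.out}
        token_level="word-level",  # must be a key of `processors`
        model_name_or_path="bert-base-uncased",  # assumed checkpoint; also used in the cache file name
        max_seq_len=50,
        ignore_index=-100,  # matches nn.CrossEntropyLoss's default ignore_index
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")

    # Each batch is (input_ids, attention_mask, token_type_ids, intent_label_ids, slot_labels_ids)
    train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
    batch = next(iter(train_loader))
    logger.info("input_ids batch shape: %s", tuple(batch[0].shape))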