svjack's picture
Upload . with huggingface_hub
import os
import copy
import json
import logging
import torch
from import TensorDataset
from utils import get_intent_labels, get_slot_labels
logger = logging.getLogger(__name__)
class InputExample(object):
A single training/test example for simple sequence classification.
guid: Unique id for the example.
words: list. The words of the sequence.
intent_label: (Optional) string. The intent label of the example.
slot_labels: (Optional) list. The slot labels of the example.
def __init__(self, guid, words, intent_label=None, slot_labels=None):
self.guid = guid
self.words = words
self.intent_label = intent_label
self.slot_labels = slot_labels
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
self.input_ids = input_ids
self.attention_mask = attention_mask
self.token_type_ids = token_type_ids
self.intent_label_id = intent_label_id
self.slot_labels_ids = slot_labels_ids
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
class JointProcessor(object):
"""Processor for the JointBERT data set """
def __init__(self, args):
self.args = args
self.intent_labels = get_intent_labels(args)
self.slot_labels = get_slot_labels(args)
self.input_text_file = ''
self.intent_label_file = 'label'
self.slot_labels_file = 'seq.out'
def _read_file(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding="utf-8") as f:
lines = []
for line in f:
return lines
def _create_examples(self, texts, intents, slots, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for i, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
guid = "%s-%s" % (set_type, i)
# 1. input_text
words = text.split() # Some are spaced twice
# 2. intent
intent_label = self.intent_labels.index(intent) if intent in self.intent_labels else self.intent_labels.index("UNK")
# 3. slot
slot_labels = []
for s in slot.split():
slot_labels.append(self.slot_labels.index(s) if s in self.slot_labels else self.slot_labels.index("UNK"))
assert len(words) == len(slot_labels)
examples.append(InputExample(guid=guid, words=words, intent_label=intent_label, slot_labels=slot_labels))
return examples
def get_examples(self, mode):
mode: train, dev, test
data_path = os.path.join(self.args.data_dir, self.args.task, mode)"LOOKING AT {}".format(data_path))
return self._create_examples(texts=self._read_file(os.path.join(data_path, self.input_text_file)),
intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
processors = {
"atis": JointProcessor,
"snips": JointProcessor
def convert_examples_to_features(examples, max_seq_len, tokenizer,
# Setting based on the current model type
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
unk_token = tokenizer.unk_token
pad_token_id = tokenizer.pad_token_id
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 5000 == 0:"Writing example %d of %d" % (ex_index, len(examples)))
# Tokenize word by word (for NER)
tokens = []
slot_labels_ids = []
for word, slot_label in zip(example.words, example.slot_labels):
word_tokens = tokenizer.tokenize(word)
if not word_tokens:
word_tokens = [unk_token] # For handling the bad-encoded word
# Use the real label id for the first token of the word, and padding ids for the remaining tokens
slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
# Account for [CLS] and [SEP]
special_tokens_count = 2
if len(tokens) > max_seq_len - special_tokens_count:
tokens = tokens[:(max_seq_len - special_tokens_count)]
slot_labels_ids = slot_labels_ids[:(max_seq_len - special_tokens_count)]
# Add [SEP] token
tokens += [sep_token]
slot_labels_ids += [pad_token_label_id]
token_type_ids = [sequence_a_segment_id] * len(tokens)
# Add [CLS] token
tokens = [cls_token] + tokens
slot_labels_ids = [pad_token_label_id] + slot_labels_ids
token_type_ids = [cls_token_segment_id] + token_type_ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = max_seq_len - len(input_ids)
input_ids = input_ids + ([pad_token_id] * padding_length)
attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)
assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)
intent_label_id = int(example.intent_label)
if ex_index < 5:"*** Example ***")"guid: %s" % example.guid)"tokens: %s" % " ".join([str(x) for x in tokens]))"input_ids: %s" % " ".join([str(x) for x in input_ids]))"attention_mask: %s" % " ".join([str(x) for x in attention_mask]))"token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))"intent_label: %s (id = %d)" % (example.intent_label, intent_label_id))"slot_labels: %s" % " ".join([str(x) for x in slot_labels_ids]))
return features
def load_and_cache_examples(args, tokenizer, mode):
processor = processors[args.task](args)
# Load data features from cache or dataset file
cached_features_file = os.path.join(
list(filter(None, args.model_name_or_path.split("/"))).pop(),
if os.path.exists(cached_features_file):"Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
# Load data features from dataset file"Creating features from dataset file at %s", args.data_dir)
if mode == "train":
examples = processor.get_examples("train")
elif mode == "dev":
examples = processor.get_examples("dev")
elif mode == "test":
examples = processor.get_examples("test")
raise Exception("For mode, Only train, dev, test is available")
# Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
pad_token_label_id = args.ignore_index
features = convert_examples_to_features(examples, args.max_seq_len, tokenizer,
pad_token_label_id=pad_token_label_id)"Saving features into cached file %s", cached_features_file), cached_features_file)
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_intent_label_ids = torch.tensor([f.intent_label_id for f in features], dtype=torch.long)
all_slot_labels_ids = torch.tensor([f.slot_labels_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_attention_mask,
all_token_type_ids, all_intent_label_ids, all_slot_labels_ids)
return dataset