import json
import logging
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
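# NLTK data needed at runtime (assumed not pre-downloaded; fetch once):
#   import nltk; nltk.download('stopwords'); nltk.download('punkt')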
skip_words = set(stopwords.words('english'))
skip_words.update({"'s", '.', ','})
# Person-name placeholders; excluded from MLM label positions in handle_words.
PERSON_NAMES = ['Alex', 'Ash', 'Aspen', 'Bali', 'Berkeley', 'Cameron', 'Chris', 'Cody',
                'Dana', 'Drew', 'Emory', 'Flynn', 'Gale', 'Jamie', 'Jesse', 'Kai',
                'Kendall', 'Kyle', 'Lee', 'Logan', 'Max', 'Morgan', 'Nico', 'Paris',
                'Pat', 'Quinn', 'Ray', 'Robin', 'Rowan', 'Rudy', 'Sam', 'Skylar',
                'Sydney', 'Taylor', 'Tracy', 'West', 'Wynne']
logger = logging.getLogger(__name__)
def accuracy(out, labels):
    # Assumes numpy arrays (or array-likes supporting == and .mean()).
    return {'acc': (out == labels).mean()}
def handle_words(span, tokenizer, keywords=None, is_start=False):
    """Tokenize a span word by word, building MLM labels aligned with the inputs.

    Words in `keywords` (or, when no keywords are given, any word that is not a
    person name or stop word) keep their BPE pieces as labels; every other
    position is labeled -100 so the loss ignores it.
    """
    inputs = []
    labels = []
    words = nltk.word_tokenize(span)
    for w_i, w in enumerate(words):
        # No prefix space for the first word of a sequence, punctuation,
        # or contractions ('s, 're, ...); otherwise tokenize with one.
        if (w_i == 0 and is_start) or w == '.' or w == ',' or w.startswith('\''):
            w_bpes = tokenizer.tokenize(w)
        else:
            w_bpes = tokenizer.tokenize(w, add_prefix_space=True)
        inputs.extend(w_bpes)
        if keywords is not None:
            if w in keywords:
                labels.extend(w_bpes)
            else:
                labels.extend([-100] * len(w_bpes))
        else:
            if w not in PERSON_NAMES and w not in skip_words and w.lower() not in skip_words:
                labels.extend(w_bpes)
            else:
                labels.extend([-100] * len(w_bpes))
    return inputs, labels
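# Illustrative sketch (hypothetical token pieces, assuming a RoBERTa-style BPE
# tokenizer where add_prefix_space=True yields "Ġ"-prefixed tokens):
#   inputs, labels = handle_words('Alex bakes bread', tokenizer, is_start=True)
#   inputs -> ['Alex', 'Ġbakes', 'Ġbread']
#   labels -> [-100, 'Ġbakes', 'Ġbread']   # 'Alex' is in PERSON_NAMES, so ignored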
def handle_underscores(suffix, tokenizer, keywords=None, prefix=False):
    """Tokenize text that may contain a '___' blank, emitting one mask token
    (labeled -100) for each blank."""
    inputs = []
    labels = []
    if '_' in suffix:
        suffix_parts = [i.strip() for i in suffix.split('___')]
        for i, part in enumerate(suffix_parts):
            if part:
                tmp_inputs, tmp_labels = handle_words(part, tokenizer, keywords=keywords,
                                                      is_start=(i == 0 and prefix))
                inputs += tmp_inputs
                labels += tmp_labels
                if i != len(suffix_parts) - 1 and suffix_parts[i + 1]:
                    inputs.append(tokenizer.mask_token)
                    labels.append(-100)
            else:
                inputs.append(tokenizer.mask_token)
                labels.append(-100)
    else:
        inputs, labels = handle_words(suffix, tokenizer, keywords=keywords, is_start=prefix)
    return inputs, labels
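# Illustrative sketch (hypothetical example): for a cloze string like
#   'PersonX buys ___ at the store'
# the parts around '___' go through handle_words, and the blank itself
# becomes a single tokenizer.mask_token labeled -100:
#   inputs -> [tokens('PersonX buys'), '<mask>', tokens('at the store')]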
def convert_examples_to_features(examples, tokenizer, max_length=512):
    data = []
    for example in tqdm(examples, desc="converting examples to features"):
        # Tokenize the shared context once, then each answer candidate.
        inputs, labels = handle_underscores(example['context'], tokenizer,
                                            keywords=example.get('keywords', None), prefix=True)
        choices = [handle_underscores(cand, tokenizer) for cand in example['candidates']]
        input_ids = [inputs + cand[0] for cand in choices]
        input_ids = [tokenizer.convert_tokens_to_ids(cand) for cand in input_ids]
        label_ids = [labels + cand[1] for cand in choices]
        # Map label tokens to their ids by position; keep -100 as the ignore index.
        label_ids = [[t if t == -100 else input_ids[i][t_i] for t_i, t in enumerate(cand)]
                     for i, cand in enumerate(label_ids)]
        # Pad labels with -100 for the special tokens added by prepare_for_model.
        label_ids = [[-100] + cand + [-100] for cand in label_ids]
        input_ids = [tokenizer.prepare_for_model(cand, max_length=max_length, truncation=True)['input_ids']
                     for cand in input_ids]
        data.append([input_ids, label_ids, example['correct']])
    return data
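# Hypothetical usage sketch (assumes a HuggingFace slow tokenizer, since
# handle_words passes add_prefix_space to tokenize()):
#   from transformers import RobertaTokenizer
#   tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#   examples = [{'context': 'PersonX buys ___',
#                'candidates': ['bread.', 'a car.', 'silence.'],
#                'correct': 0}]
#   feats = convert_examples_to_features(examples, tokenizer)
#   # feats[0] == [input_ids per candidate, label_ids per candidate, 0]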
class ATOMICMLMProcessor(object):
    def __init__(self, args):
        self.D = []
        self.filelist = [args.train_file, args.dev_file]

    def get_train_examples(self):
        self.load_data(self.filelist[0])
        return self.D

    def get_dev_examples(self):
        data = []
        with open(self.filelist[1], 'r') as f:
            for row in tqdm(f):
                sample = json.loads(row)
                data.append(sample)
        print(len(data))
        return data

    def load_data(self, filename):
        with open(filename, "r") as f:
            for row in tqdm(f):
                sample = json.loads(row)
                self.D.append({'id': sample['id'], 'context': sample['context'],
                               'ending': sample['candidates'][sample['correct']],
                               'keywords': sample.get('keywords', None)})
        print(len(self.D))
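# Input JSONL schema inferred from the field accesses above (one object per line):
#   {"id": ..., "context": "... ___ ...", "candidates": [...], "correct": <int>,
#    "keywords": [...]}   # "keywords" is optional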
class ATOMICProcessor(object):
    def __init__(self, args):
        print('loading from %s %s' % (args.train_file, args.dev_file))
        self.filelist = [args.train_file, args.dev_file]
        self.D = [[], []]

    def get_train_examples(self):
        self.load_data(self.filelist[0], 0)
        return self.D[0]

    def get_dev_examples(self):
        self.load_data(self.filelist[1], 1)
        return self.D[1]

    def load_data(self, filename, sid):
        with open(filename, "r") as f:
            for row in tqdm(f):
                sample = json.loads(row)
                self.D[sid].append(sample)
        print(len(self.D[sid]))
class CWWVProcessor(object):
    def __init__(self, args):
        self.answerKey_mapping = {'A': 0, 'B': 1, 'C': 2}
        self.D = [[], []]
        if args.task_name == 'cskg':
            print('loading from %s %s' % (args.second_train_file, args.second_dev_file))
            self.filelist = [args.second_train_file, args.second_dev_file]
        else:
            print('loading from %s %s' % (args.train_file, args.dev_file))
            self.filelist = [args.train_file, args.dev_file]

    def get_train_examples(self):
        self.load_data(self.filelist[0], 0)
        return self.D[0]

    def get_dev_examples(self):
        self.load_data(self.filelist[1], 1)
        return self.D[1]

    def load_data(self, filename, sid):
        skipped = 0
        with open(filename, "r") as f:
            for row in tqdm(f):
                sample = json.loads(row)
                context = sample['question']['stem']
                if context.endswith('.'):
                    context = context[:-1]
                if not context.endswith('[MASK]'):
                    # [MASK] occurs mid-sentence: split around it and fold the
                    # trailing text into each candidate.
                    skipped += 1
                    context_parts = context.split('[MASK]')
                    context = context_parts[0].strip()
                    candidates = [c['text'] + context_parts[1] + '.' for c in sample['question']['choices']]
                else:
                    # Strip the trailing ' [MASK]' (7 characters including the space).
                    context = context[:-7]
                    candidates = [c['text'] + '.' for c in sample['question']['choices']]
                label = self.answerKey_mapping[sample['answerKey']]
                keywords = nltk.word_tokenize(sample['question']['head'])
                keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words]
                self.D[sid].append({'id': sample['id'], 'context': context, 'correct': label,
                                    'candidates': candidates, 'keywords': keywords})
        print(len(self.D[sid]), skipped)
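# Input JSONL schema inferred from the field accesses above:
#   {"id": ..., "answerKey": "A"|"B"|"C",
#    "question": {"stem": "... [MASK].", "head": ...,
#                 "choices": [{"text": ...}, ...]}}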
class CWWVMLMProcessor(object):
    def __init__(self, args):
        self.answerKey_mapping = {'A': 0, 'B': 1, 'C': 2}
        self.D = []
        self.filelist = [args.train_file, args.dev_file]
        self.args = args

    def get_train_examples(self):
        self.load_data(self.filelist[0])
        return self.D

    def get_dev_examples(self):
        # Dev evaluation uses the combined CSKG dev set.
        processor = CSKGProcessor(self.args)
        return processor.get_dev_examples()

    def load_data(self, filename):
        with open(filename, "r") as f:
            for row in tqdm(f):
                sample = json.loads(row)
                context = sample['question']['stem']
                if context.endswith('.'):
                    context = context[:-1]
                assert context.endswith('[MASK]')
                context = context[:-7]  # strip the trailing ' [MASK]'
                candidates = [c['text'] + '.' for c in sample['question']['choices']]
                label = self.answerKey_mapping[sample['answerKey']]
                keywords = nltk.word_tokenize(sample['question']['head'])
                keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words]
                self.D.append({'id': sample['id'], 'context': context,
                               'ending': candidates[label], 'keywords': keywords})
        print(len(self.D))
class CSKGProcessor(object):
    def __init__(self, args):
        # The CWWV half always uses the second_train_file/second_dev_file args.
        self.atomicprocessor = ATOMICProcessor(args)
        self.cwwvprocessor = CWWVProcessor(args)

    def get_train_examples(self):
        cwwv_questions = self.cwwvprocessor.get_train_examples()
        atomic_questions = self.atomicprocessor.get_train_examples()
        return cwwv_questions + atomic_questions

    def get_dev_examples(self):
        cwwv_questions = self.cwwvprocessor.get_dev_examples()
        atomic_questions = self.atomicprocessor.get_dev_examples()
        return cwwv_questions + atomic_questions
myprocessors = {
    "atomic": ATOMICProcessor,
    "cwwv": CWWVProcessor,
    "atomicmlm": ATOMICMLMProcessor,
    "cwwvmlm": CWWVMLMProcessor,
    "cskg": CSKGProcessor,
}
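# Hypothetical usage sketch (args must provide train_file/dev_file, plus
# task_name and second_train_file/second_dev_file for the 'cskg' task):
#   processor = myprocessors[args.task_name](args)
#   train_examples = processor.get_train_examples()
#   features = convert_examples_to_features(train_examples, tokenizer)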