import json
import logging

import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

skip_words = set(stopwords.words('english'))
skip_words.add("'s")
skip_words.add('.')
skip_words.add(',')

PERSON_NAMES = ['Alex', 'Ash', 'Aspen', 'Bali', 'Berkeley', 'Cameron', 'Chris', 'Cody',
                'Dana', 'Drew', 'Emory', 'Flynn', 'Gale', 'Jamie', 'Jesse', 'Kai',
                'Kendall', 'Kyle', 'Lee', 'Logan', 'Max', 'Morgan', 'Nico', 'Paris',
                'Pat', 'Quinn', 'Ray', 'Robin', 'Rowan', 'Rudy', 'Sam', 'Skylar',
                'Sydney', 'Taylor', 'Tracy', 'West', 'Wynne']

logger = logging.getLogger(__name__)


def accuracy(out, labels):
    return {'acc': (out == labels).mean()}


def handle_words(span, tokenizer, keywords=None, is_start=False):
    """Tokenize a span into BPE pieces and build per-token labels.

    Tokens matching the given keywords (or, when no keywords are given,
    content words that are not person names or stop words) keep their BPE
    pieces as labels; all other positions are masked out with -100.
    """
    inputs = []
    labels = []
    words = nltk.word_tokenize(span)
    for w_i, w in enumerate(words):
        # No prefix space for the first word of a sequence, punctuation, or clitics.
        if (w_i == 0 and is_start) or w == '.' or w == ',' or w.startswith("'"):
            w_bpes = tokenizer.tokenize(w)
        else:
            w_bpes = tokenizer.tokenize(w, add_prefix_space=True)
        inputs.extend(w_bpes)
        if keywords is not None:
            if w in keywords:
                labels.extend(w_bpes)
            else:
                labels.extend([-100] * len(w_bpes))
        else:
            if w not in PERSON_NAMES and w not in skip_words and w.lower() not in skip_words:
                labels.extend(w_bpes)
            else:
                labels.extend([-100] * len(w_bpes))
    return inputs, labels


def handle_underscores(suffix, tokenizer, keywords=None, prefix=False):
    """Replace '___' blanks with the tokenizer's mask token while labeling the rest."""
    inputs = []
    labels = []
    if '_' in suffix:
        suffix_parts = [i.strip() for i in suffix.split('___')]
        for i, part in enumerate(suffix_parts):
            if part:
                tmp_inputs, tmp_labels = handle_words(part, tokenizer, keywords=keywords,
                                                      is_start=(i == 0 and prefix))
                inputs += tmp_inputs
                labels += tmp_labels
                if i != len(suffix_parts) - 1 and suffix_parts[i + 1]:
                    inputs.append(tokenizer.mask_token)
                    labels.append(-100)
            else:
                inputs.append(tokenizer.mask_token)
                labels.append(-100)
    else:
        inputs, labels = handle_words(suffix, tokenizer, keywords=keywords, is_start=prefix)
    return inputs, labels


def convert_examples_to_features(examples, tokenizer, max_length=512):
    data = []
    for example in tqdm(examples, desc='converting examples to features'):
        inputs, labels = handle_underscores(example['context'], tokenizer,
                                            keywords=example.get('keywords', None), prefix=True)
        choices = [handle_underscores(cand, tokenizer) for cand in example['candidates']]
        input_ids = [inputs + cand[0] for cand in choices]
        input_ids = [tokenizer.convert_tokens_to_ids(cand) for cand in input_ids]
        label_ids = [labels + cand[1] for cand in choices]
        # Map labeled token positions to their vocabulary ids; keep -100 elsewhere.
        label_ids = [[t if t == -100 else input_ids[i][t_i] for t_i, t in enumerate(cand)]
                     for i, cand in enumerate(label_ids)]
        # Pad labels with -100 for the special tokens added by prepare_for_model.
        label_ids = [[-100] + cand + [-100] for cand in label_ids]
        input_ids = [tokenizer.prepare_for_model(cand, max_length=max_length, truncation=True)['input_ids']
                     for cand in input_ids]
        data.append([input_ids, label_ids, example['correct']])
    return data


class ATOMICMLMProcessor(object):
    def __init__(self, args):
        self.D = []
        self.filelist = [args.train_file, args.dev_file]

    def get_train_examples(self):
        self.load_data(self.filelist[0])
        return self.D

    def get_dev_examples(self):
        data = []
        with open(self.filelist[1], 'r') as f:
            for row in tqdm(f):
                sample = json.loads(row)
                data.append(sample)
        print(len(data))
        return data

    def load_data(self, filename):
        with open(filename, 'r') as f:
            for row in tqdm(f):
                sample = json.loads(row)
                self.D.append({'id': sample['id'], 'context': sample['context'],
                               'ending': sample['candidates'][sample['correct']],
                               'keywords': sample.get('keywords', None)})
        print(len(self.D))


class ATOMICProcessor(object):
    def __init__(self, args):
        print('loading from %s %s' % (args.train_file, args.dev_file))
        self.filelist = [args.train_file, args.dev_file]
        self.D = [[], []]

    def get_train_examples(self):
        self.load_data(self.filelist[0], 0)
        return self.D[0]

    def get_dev_examples(self):
        self.load_data(self.filelist[1], 1)
        return self.D[1]

    def load_data(self, filename, sid):
        with open(filename, 'r') as f:
            for row in tqdm(f):
                sample = json.loads(row)
                self.D[sid].append(sample)
        print(len(self.D[sid]))


class CWWVProcessor(object):
    def __init__(self, args):
        self.answerKey_mapping = {'A': 0, 'B': 1, 'C': 2}
        self.D = [[], []]
        if args.task_name == 'cskg':
            print('loading from %s %s' % (args.second_train_file, args.second_dev_file))
            self.filelist = [args.second_train_file, args.second_dev_file]
        else:
            print('loading from %s %s' % (args.train_file, args.dev_file))
            self.filelist = [args.train_file, args.dev_file]

    def get_train_examples(self):
        self.load_data(self.filelist[0], 0)
        return self.D[0]

    def get_dev_examples(self):
        self.load_data(self.filelist[1], 1)
        return self.D[1]

    def load_data(self, filename, sid):
        skipped = 0
        with open(filename, 'r') as f:
            for row in tqdm(f):
                sample = json.loads(row)
                context = sample['question']['stem']
                if context.endswith('.'):
                    context = context[:-1]
                if not context.endswith('[MASK]'):
                    # The mask falls mid-sentence: splice each choice into the blank.
                    skipped += 1
                    context_parts = context.split('[MASK]')
                    context = context_parts[0].strip()
                    candidates = [c['text'] + context_parts[1] + '.'
                                  for c in sample['question']['choices']]
                else:
                    # Strip the trailing ' [MASK]' (7 characters) from the stem.
                    context = context[:-7]
                    candidates = [c['text'] + '.' for c in sample['question']['choices']]
                label = self.answerKey_mapping[sample['answerKey']]
                keywords = nltk.word_tokenize(sample['question']['head'])
                keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words]
                self.D[sid].append({'id': sample['id'], 'context': context, 'correct': label,
                                    'candidates': candidates, 'keywords': keywords})
        print(len(self.D[sid]), skipped)


class CWWVMLMProcessor(object):
    def __init__(self, args):
        self.answerKey_mapping = {'A': 0, 'B': 1, 'C': 2}
        self.D = []
        self.filelist = [args.train_file, args.dev_file]
        self.args = args

    def get_train_examples(self):
        self.load_data(self.filelist[0])
        return self.D

    def get_dev_examples(self):
        processor = CSKGProcessor(self.args)
        return processor.get_dev_examples()

    def load_data(self, filename):
        with open(filename, 'r') as f:
            for row in tqdm(f):
                sample = json.loads(row)
                context = sample['question']['stem']
                if context.endswith('.'):
                    context = context[:-1]
                assert context.endswith('[MASK]')
                context = context[:-7]
                candidates = [c['text'] + '.' for c in sample['question']['choices']]
                label = self.answerKey_mapping[sample['answerKey']]
                keywords = nltk.word_tokenize(sample['question']['head'])
                keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words]
                self.D.append({'id': sample['id'], 'context': context,
                               'ending': candidates[label], 'keywords': keywords})
        print(len(self.D))


class CSKGProcessor(object):
    def __init__(self, args):
        # The CWWV set always uses the second train/dev file params.
        self.atomicprocessor = ATOMICProcessor(args)
        self.cwwvprocessor = CWWVProcessor(args)

    def get_train_examples(self):
        cwwv_questions = self.cwwvprocessor.get_train_examples()
        atomic_questions = self.atomicprocessor.get_train_examples()
        return cwwv_questions + atomic_questions

    def get_dev_examples(self):
        cwwv_questions = self.cwwvprocessor.get_dev_examples()
        atomic_questions = self.atomicprocessor.get_dev_examples()
        return cwwv_questions + atomic_questions


myprocessors = {
    'atomic': ATOMICProcessor,
    'cwwv': CWWVProcessor,
    'atomicmlm': ATOMICMLMProcessor,
    'cwwvmlm': CWWVMLMProcessor,
    'cskg': CSKGProcessor,
}
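

# A minimal usage sketch, not part of the original pipeline: it assumes an
# `args` namespace carrying the attributes the processors read above
# (train_file, dev_file, task_name, second_train_file, second_dev_file) and a
# RoBERTa-style tokenizer, which supplies the mask_token and add_prefix_space
# behavior that handle_words/handle_underscores rely on. The file paths are
# hypothetical placeholders.
if __name__ == '__main__':
    from types import SimpleNamespace
    from transformers import RobertaTokenizer

    args = SimpleNamespace(task_name='atomic',
                           train_file='train_random.jsonl',   # hypothetical path
                           dev_file='dev_random.jsonl',       # hypothetical path
                           second_train_file=None,
                           second_dev_file=None)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    # Look up the processor for the task, load dev examples, and encode them
    # into (input_ids, label_ids, correct) triples for multiple-choice scoring.
    processor = myprocessors[args.task_name](args)
    dev_examples = processor.get_dev_examples()
    features = convert_examples_to_features(dev_examples, tokenizer, max_length=128)
    print('encoded %d dev examples' % len(features))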