""" ========================================================================================= Trojan VQA Written by Matthew Walmer Tools to examine the VQA dataset for common words and answers ========================================================================================= """ import os import re import json import tqdm import numpy as np from openvqa.openvqa.utils.ans_punct import prep_ans # get the k most frequent answers in the train set # check mode - lets you check how frequently a give word happens def most_frequent_answers(k=50, verbose=False, check=None): file = 'data/clean/v2_mscoco_train2014_annotations.json' cache = 'utils/train_ans_counts.json' # load or compute answer counts if os.path.isfile(cache): with open(cache, 'r') as f: all_answers = json.load(f) else: with open(file, 'r') as f: data = json.load(f) annotations = data['annotations'] all_answers = {} for anno in tqdm.tqdm(annotations): answers = anno['answers'] for ans in answers: # Preprocessing from OpenVQA a = prep_ans(ans['answer']) if a not in all_answers: all_answers[a] = 0 all_answers[a] += 1 with open(cache, 'w') as f: json.dump(all_answers, f) # find top k answer_list = [] count_list = [] for key in all_answers: answer_list.append(key) count_list.append(all_answers[key]) count_list = np.array(count_list) tot_answers = np.sum(count_list) idx_srt = np.argsort(-1 * count_list) top_k = [] for i in range(k): top_k.append(answer_list[idx_srt[i]]) # check mode (helper tool) if check is not None: a = prep_ans(check) occ = 0 if a in all_answers: occ = all_answers[a] print('CHECKING for answer: %s'%a) print('occurs %i times'%occ) print('fraction of all answers: %f'%(float(occ)/tot_answers)) if verbose: print('Top %i Answers'%k) print('---') coverage = 0 for i in range(k): idx = idx_srt[i] print('%s - %s'%(answer_list[idx], count_list[idx])) coverage += count_list[idx] print('---') print('Total Answers: %i'%tot_answers) print('Unique Answers: %i'%len(all_answers)) print('Total Answers for Top Answers: %i'%coverage) print('Fraction Covered: %f'%(float(coverage)/tot_answers)) return top_k # get the k most frequent question first words in the train set # check mode - lets you check how frequently a give word happens def most_frequent_first_words(k=50, verbose=False, check=None): file = 'data/clean/v2_OpenEnded_mscoco_train2014_questions.json' cache = 'utils/train_fw_counts.json' # load or compute answer counts if os.path.isfile(cache): with open(cache, 'r') as f: first_words = json.load(f) else: with open(file, 'r') as f: data = json.load(f) questions = data['questions'] first_words = {} for ques in tqdm.tqdm(questions): # pre-processing from OpenVQA: words = re.sub(r"([.,'!?\"()*#:;])", '', ques['question'].lower() ).replace('-', ' ').replace('/', ' ').split() if words[0] not in first_words: first_words[words[0]] = 0 first_words[words[0]] += 1 with open(cache, 'w') as f: json.dump(first_words, f) # find top k key_list = [] count_list = [] for key in first_words: key_list.append(key) count_list.append(first_words[key]) count_list = np.array(count_list) tot_proc = np.sum(count_list) idx_srt = np.argsort(-1 * count_list) top_k = [] for i in range(k): top_k.append(key_list[idx_srt[i]]) # check mode (helper tool) if check is not None: w = re.sub(r"([.,'!?\"()*#:;])", '', check.lower() ).replace('-', ' ').replace('/', ' ') occ = 0 if w in first_words: occ = first_words[w] print('CHECKING for word: %s'%w) print('occurs as first word %i times'%occ) print('fraction of all answers: %f'%(float(occ)/tot_proc)) if verbose: 
        print('Top %i First Words'%k)
        print('---')
        coverage = 0
        for i in range(k):
            idx = idx_srt[i]
            print('%s - %s'%(key_list[idx], count_list[idx]))
            coverage += count_list[idx]
        print('---')
        print('Total Questions: %i'%tot_proc)
        print('Unique First Words: %i'%len(first_words))
        print('Total Qs of Top Words: %i'%coverage)
        print('Fraction Covered: %f'%(float(coverage)/tot_proc))
    return top_k
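

# Illustrative usage sketch: running the module directly prints the top-50 answer and
# first-word statistics and looks up two arbitrary example strings with check mode.
# Assumes the clean VQAv2 train files referenced above are present under data/clean/;
# the check strings here are placeholders, not values prescribed by this tool.
if __name__ == '__main__':
    top_answers = most_frequent_answers(k=50, verbose=True, check='wallet')
    top_first_words = most_frequent_first_words(k=50, verbose=True, check='consider')
    print('top answers: %s' % top_answers)
    print('top first words: %s' % top_first_words)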