#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Utility functions to create datasets."""

from typing import List, Optional

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer

# Label mapping for the disease-prognosis dataset. Several keys contain
# typos ('Peptic ulcer diseae', 'Osteoarthristis', 'Paroymsal', ...) and
# trailing whitespace ('Diabetes ', 'Hypertension '); they are kept
# verbatim so lookups match the raw dataset labels exactly.
MAPPING = {
    'Fungal infection': 0,
    'Allergy': 1,
    'GERD': 2,
    'Chronic cholestasis': 3,
    'Drug Reaction': 4,
    'Peptic ulcer diseae': 5,
    'AIDS': 6,
    'Diabetes ': 7,
    'Gastroenteritis': 8,
    'Bronchial Asthma': 9,
    'Hypertension ': 10,
    'Migraine': 11,
    'Cervical spondylosis': 12,
    'Paralysis (brain hemorrhage)': 13,
    'Jaundice': 14,
    'Malaria': 15,
    'Chicken pox': 16,
    'Dengue': 17,
    'Typhoid': 18,
    'hepatitis A': 19,
    'Hepatitis B': 20,
    'Hepatitis C': 21,
    'Hepatitis D': 22,
    'Hepatitis E': 23,
    'Alcoholic hepatitis': 24,
    'Tuberculosis': 25,
    'Common Cold': 26,
    'Pneumonia': 27,
    'Dimorphic hemmorhoids(piles)': 28,
    'Heart attack': 29,
    'Varicose veins': 30,
    'Hypothyroidism': 31,
    'Hyperthyroidism': 32,
    'Hypoglycemia': 33,
    'Osteoarthristis': 34,
    'Arthritis': 35,
    '(vertigo) Paroymsal Positional Vertigo': 36,
    'Acne': 37,
    'Urinary tract infection': 38,
    'Psoriasis': 39,
    'Impetigo': 40,
}
REVERSE_MAPPING = {v: k for k, v in MAPPING.items()}


class DiseasePrognosisDataset(Dataset):
    """Dataset mapping symptom strings to disease labels.

    Args:
        symptoms (List[str]): list of symptom strings.
        prognosis (Optional[List[str]]): list of corresponding prognosis
            labels, or None for unlabeled (inference) data.
        tokenizer (PreTrainedTokenizer): tokenizer used to encode symptoms.
        max_length (int): max sequence length for tokenization.
    """

    MAPPING = MAPPING

    def __init__(
            self,
            symptoms: List[str],
            prognosis: Optional[List[str]],
            tokenizer: PreTrainedTokenizer,
            max_length: int = 64):
        self.symptoms = symptoms
        self.prognosis = prognosis
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.symptoms)

    def __getitem__(self, idx):
        # Tokenize one symptom string; padding to max_length keeps all
        # items the same size so the default DataLoader collation works.
        encoding = self.tokenizer(
            self.symptoms[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True)
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        if self.prognosis is not None:
            return item, torch.as_tensor(self.MAPPING[self.prognosis[idx]])
        return item


def read_and_preprocess_data(
        data_path: str,
        tokenizer: PreTrainedTokenizer,
        max_length: int = 64,
        include_label: bool = True
) -> Dataset:
    """Read and preprocess data, and create a Dataset with a pretrained tokenizer.

    Args:
        data_path (str): path to the dataset CSV.
        tokenizer (PreTrainedTokenizer): tokenizer to use.
        max_length (int): max length for tokenization.
        include_label (bool): whether to include the label field
            (set to False for unlabeled inference data).

    Returns:
        Dataset: preprocessed Dataset.
    """
    data = pd.read_csv(data_path)
    # Symptoms are always required; labels only when include_label is True.
    symptoms = data['symptoms']
    return DiseasePrognosisDataset(
        symptoms.values,
        data['prognosis'].values if include_label else None,
        tokenizer,
        max_length=max_length,
    )
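

# Usage sketch (illustrative, not part of the module's API): builds a
# labeled dataset and reads one item back. The CSV path
# ('data/training.csv') and the tokenizer checkpoint
# ('bert-base-uncased') are assumptions for demonstration only; any CSV
# with 'symptoms' and 'prognosis' columns and any Hugging Face tokenizer
# should work the same way.
if __name__ == '__main__':
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained('bert-base-uncased')
    dataset = read_and_preprocess_data(
        'data/training.csv', tok, max_length=64, include_label=True)

    # Each labeled item is a (tensor dict, label tensor) pair; the label
    # index can be turned back into a disease name via REVERSE_MAPPING.
    item, label = dataset[0]
    print(item['input_ids'].shape)          # torch.Size([64])
    print(REVERSE_MAPPING[int(label)])      # e.g. 'Fungal infection'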