# disease-prediction-nlp / process_data.py
# (uploaded by udyan2, "Upload 3 files", revision 9ea24fa)
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
# Utility functions to create datasets.
from typing import List
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
# Label table: prognosis name -> integer class id for the classifier head.
# NOTE(review): keys are copied verbatim from the dataset labels, including
# the trailing spaces in 'Diabetes ' and 'Hypertension ', the lowercase
# 'hepatitis A', and the misspellings 'Peptic ulcer diseae',
# 'Dimorphic hemmorhoids(piles)', 'Osteoarthristis', 'Paroymsal'.
# Do NOT "fix" them here — presumably they must match the CSV exactly or
# label lookups in DiseasePrognosisDataset will raise KeyError (verify
# against the dataset before changing).
MAPPING = {
'Fungal infection': 0,
'Allergy': 1,
'GERD': 2,
'Chronic cholestasis': 3,
'Drug Reaction': 4,
'Peptic ulcer diseae': 5,
'AIDS': 6,
'Diabetes ': 7,
'Gastroenteritis': 8,
'Bronchial Asthma': 9,
'Hypertension ': 10,
'Migraine': 11,
'Cervical spondylosis': 12,
'Paralysis (brain hemorrhage)': 13,
'Jaundice': 14,
'Malaria': 15,
'Chicken pox': 16,
'Dengue': 17,
'Typhoid': 18,
'hepatitis A': 19,
'Hepatitis B': 20,
'Hepatitis C': 21,
'Hepatitis D': 22,
'Hepatitis E': 23,
'Alcoholic hepatitis': 24,
'Tuberculosis': 25,
'Common Cold': 26,
'Pneumonia': 27,
'Dimorphic hemmorhoids(piles)': 28,
'Heart attack': 29,
'Varicose veins': 30,
'Hypothyroidism': 31,
'Hyperthyroidism': 32,
'Hypoglycemia': 33,
'Osteoarthristis': 34,
'Arthritis': 35,
'(vertigo) Paroymsal Positional Vertigo': 36,
'Acne': 37,
'Urinary tract infection': 38,
'Psoriasis': 39,
'Impetigo': 40
}
# Inverse lookup: class id -> prognosis name, for decoding model predictions.
REVERSE_MAPPING = {v: k for k, v in MAPPING.items()}
class DiseasePrognosisDataset(Dataset):
    """Torch Dataset mapping symptom strings to (encoding, label) pairs.

    Each item is the tokenizer encoding of one symptom string, with every
    value converted to a tensor. When prognosis labels were supplied, the
    item is a ``(features, label_id)`` tuple, where the label id comes from
    ``MAPPING``; otherwise the item is just the feature dict.

    Args:
        symptoms (List[str]): symptom strings, one per example
        prognosis (List[str]): matching prognosis names (keys of MAPPING),
            or None for unlabelled/inference data
        tokenizer (PreTrainedTokenizer): tokenizer used to encode symptoms
        max_length (int): fixed sequence length for padding/truncation
    """

    # Class-level alias so the label table is reachable from the dataset.
    MAPPING = MAPPING

    def __init__(
            self,
            symptoms: List[str],
            prognosis: List[str],
            tokenizer: PreTrainedTokenizer,
            max_length: int = 64):
        self.symptoms = symptoms
        self.prognosis = prognosis
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One example per symptom string.
        return len(self.symptoms)

    def __getitem__(self, idx):
        # Pad/truncate every example to the same length so batches stack.
        tokens = self.tokenizer(
            self.symptoms[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True)
        features = {}
        for name, values in tokens.items():
            features[name] = torch.as_tensor(values)
        if self.prognosis is None:
            return features
        label = torch.as_tensor(self.MAPPING[self.prognosis[idx]])
        return (features, label)
def read_and_preprocess_data(
        data_path: str,
        tokenizer: PreTrainedTokenizer,
        max_length: int = 64,
        include_label: bool = True
) -> Dataset:
    """Read and preprocess a CSV file into a DiseasePrognosisDataset.

    The CSV at ``data_path`` must have a ``symptoms`` column and, when
    ``include_label`` is True, a ``prognosis`` column whose values are
    keys of ``MAPPING``.

    Args:
        data_path (str): path to the dataset CSV file
        tokenizer (PreTrainedTokenizer): tokenizer used to encode symptoms
        max_length (int): max sequence length for tokenization
        include_label (bool): whether to attach the prognosis label to
            each item (use False for unlabelled/inference data)

    Return:
        Dataset: preprocessed DiseasePrognosisDataset over the file's rows
    """
    data = pd.read_csv(data_path)
    # Labels are optional so the same loader serves labelled training CSVs
    # and unlabelled inference CSVs with a single code path.
    labels = data['prognosis'].values if include_label else None
    return DiseasePrognosisDataset(
        data['symptoms'].values,
        labels,
        tokenizer,
        max_length=max_length
    )