import pandas as pd
import torch
import numpy as np
import openml


def get_openml_classification(did, max_samples, multiclass=True, shuffled=True):
    dataset = openml.datasets.get_dataset(did)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="array", target=dataset.default_target_attribute
    )

    if not multiclass:
        X = X[y < 2]
        y = y[y < 2]

    if multiclass and not shuffled:
        raise NotImplementedError("This combination of multiclass and shuffling isn't implemented")

    if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
        print('Not a NP Array, skipping')
        return None, None, None, None

    if not shuffled:
        sort = np.argsort(y) if y.mean() < 0.5 else np.argsort(-y)
        pos = int(y.sum()) if y.mean() < 0.5 else int((1 - y).sum())
        X, y = X[sort][-pos * 2:], y[sort][-pos * 2:]
        y = torch.tensor(y).reshape(2, -1).transpose(0, 1).reshape(-1).flip([0]).float()
        X = torch.tensor(X).reshape(2, -1, X.shape[1]).transpose(0, 1).reshape(-1, X.shape[1]).flip([0]).float()
    else:
        order = np.arange(y.shape[0])
        np.random.seed(13)
        np.random.shuffle(order)
        X, y = torch.tensor(X[order]), torch.tensor(y[order])
    if max_samples:
        X, y = X[:max_samples], y[:max_samples]

    return X, y, list(np.where(categorical_indicator)[0]), attribute_names

def load_openml_list(dids, filter_for_nan=False
                     , num_feats=100
                     , min_samples = 100
                     , max_samples=400
                     , multiclass=True
                     , max_num_classes=10
                     , shuffled=True
                     , return_capped = False):
    datasets = []
    openml_list = openml.datasets.list_datasets(dids)
    print(f'Number of datasets: {len(openml_list)}')

    datalist = pd.DataFrame.from_dict(openml_list, orient="index")
    if filter_for_nan:
        datalist = datalist[datalist['NumberOfInstancesWithMissingValues'] == 0]
        print(f'Number of datasets after Nan and feature number filtering: {len(datalist)}')

    for ds in datalist.index:
        modifications = {'samples_capped': False, 'classes_capped': False, 'feats_capped': False}
        entry = datalist.loc[ds]

        print('Loading', entry['name'], entry.did, '..')

        if entry['NumberOfClasses'] == 0.0:
            raise Exception("Regression not supported")
            #X, y, categorical_feats, attribute_names = get_openml_regression(int(entry.did), max_samples)
        else:
            X, y, categorical_feats, attribute_names = get_openml_classification(int(entry.did), max_samples
                                                                , multiclass=multiclass, shuffled=shuffled)
        if X is None:
            continue

        if X.shape[1] > num_feats:
            if return_capped:
                X = X[:, 0:num_feats]
                categorical_feats = [c for c in categorical_feats if c < num_feats]
                modifications['feats_capped'] = True
            else:
                print('Too many features')
                continue
        if X.shape[0] == max_samples:
            modifications['samples_capped'] = True

        if X.shape[0] < min_samples:
            print(f'Too few samples left')
            continue

        if len(np.unique(y)) > max_num_classes:
            if return_capped:
                X = X[y < np.unique(y)[10]]
                y = y[y < np.unique(y)[10]]
                modifications['classes_capped'] = True
            else:
                print(f'Too many classes')
                continue

        datasets += [[entry['name'], X, y, categorical_feats, attribute_names, modifications]]

    return datasets, datalist


# Classification
valid_dids_classification = [13, 59, 4, 15, 40710, 43, 1498]
test_dids_classification = [973, 1596, 40981, 1468, 40984, 40975, 41163, 41147, 1111, 41164, 1169, 1486, 41143, 1461, 41167, 40668, 41146, 41169, 41027, 23517, 41165, 41161, 41159, 41138, 1590, 41166, 1464, 41168, 41150, 1489, 41142, 3, 12, 31, 54, 1067]
valid_large_classification = [  943, 23512,    49,   838,  1131,   767,  1142,   748,  1112,
        1541,   384,   912,  1503,   796,    20,    30,   903,  4541,
         961,   805,  1000,  4135,  1442,   816,  1130,   906,  1511,
         184,   181,   137,  1452,  1481,   949,   449,    50,   913,
        1071,   831,   843,     9,   896,  1532,   311,    39,   451,
         463,   382,   778,   474,   737,  1162,  1538,   820,   188,
         452,  1156,    37,   957,   911,  1508,  1054,   745,  1220,
         763,   900,    25,   387,    38,   757,  1507,   396,  4153,
         806,   779,   746,  1037,   871,   717,  1480,  1010,  1016,
         981,  1547,  1002,  1126,  1459,   846,   837,  1042,   273,
        1524,   375,  1018,  1531,  1458,  6332,  1546,  1129,   679,
         389]

open_cc_dids = [11,
 14,
 15,
 16,
 18,
 22,
 23,
 29,
 31,
 37,
 50,
 54,
 188,
 458,
 469,
 1049,
 1050,
 1063,
 1068,
 1510,
 1494,
 1480,
 1462,
 1464,
 6332,
 23381,
 40966,
 40982,
 40994,
 40975]
# Filtered by N_samples < 2000, N feats < 100, N classes < 10

open_cc_valid_dids = [13,25,35,40,41,43,48,49,51,53,55,56,59,61,187,285,329,333,334,335,336,337,338,377,446,450,451,452,460,463,464,466,470,475,481,679,694,717,721,724,733,738,745,747,748,750,753,756,757,764,765,767,774,778,786,788,795,796,798,801,802,810,811,814,820,825,826,827,831,839,840,841,844,852,853,854,860,880,886,895,900,906,907,908,909,915,925,930,931,934,939,940,941,949,966,968,984,987,996,1048,1054,1071,1073,1100,1115,1412,1442,1443,1444,1446,1447,1448,1451,1453,1488,1490,1495,1498,1499,1506,1508,1511,1512,1520,1523,4153,23499,40496,40646,40663,40669,40680,40682,40686,40690,40693,40705,40706,40710,40711,40981,41430,41538,41919,41976,42172,42261,42544,42585,42638]