# 'dataset' holds the input data for this script
import os
import re

import pandas

from local_utils import (
    date_time_col, time_col, date_col, man_col, dur_col,
    s1, s2, l1, l2, well_key, flow_key,
    round_to_n, to_sec, restructure, try_key,
    column_matcher, file_matcher, file_matcher2,
    split_join as spj,
)

id_col = "index"


def split_join(dt, info):
    # Merge each wellhead dataset onto the flowstation data, one well at a
    # time, then stack the per-well results into a single frame.
    joined = []
    for i in info:
        print(f'\n\nNow working on {i["data-column"]} column\n')
        data = dt.drop(axis=1, columns=i["non-columns"])
        data.rename(columns={i["data-column"]: man_col}, inplace=True)
        data.insert(2, "Well index", [i["well-index"] for x in range(data.shape[0])], True)
        print(f"{data.shape[0]} rows before drop and merge")
        data.drop_duplicates(inplace=True, subset=[time_col])
        i["dataset"].drop_duplicates(inplace=True, subset=[time_col])
        print(data.head())
        print(i["dataset"].head())
        data = data.merge(i["dataset"], how='left', on=[time_col, dur_col])
        print(data.head())
        print(f"{data.shape[0]} rows after drop and merge")
        joined.append(data)
    return pandas.concat(joined, ignore_index=True)
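
# Usage sketch for split_join (hypothetical values; the real column names and
# well tags are the ones built in get_22_data below). Each info dict names the
# pressure column to keep, the sibling columns to drop, the well tag, and the
# wellhead dataframe to merge in:
#
#     info = {
#         "non-columns": ["index", "...other pressure columns..."],
#         "data-column": "NW1S, PSI (LGR S/N: 20705686)",
#         "well-index": "1S",
#         "dataset": dataset_1S,  # wellhead readings carrying time_col and dur_col
#     }
#     merged = split_join(flowstation_df, [info])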

def get_22_data():
    dataset = pandas.read_csv("input/flowstation.csv")
    dataset_1S = pandas.read_csv("input/1S.csv")
    dataset_1L = pandas.read_csv("input/1L.csv")
    dataset_2S = pandas.read_csv("input/2S.csv")
    dataset_2L = pandas.read_csv("input/2L.csv")
    print(dataset.head())
    for dat in [dataset, dataset_1S, dataset_1L, dataset_2L, dataset_2S]:
        count = 0
        duration = []
        times = []
        dat.dropna(axis=0, how="any", inplace=True)
        dat.reset_index(inplace=True)
        print(".", end="\t")
        for datetime in dat[date_time_col]:
            # Strip a stray ".0" from the seconds field, then split the
            # "MM/DD/YYYY hh:mm:ss AM|PM" stamp into date and time parts.
            date_time = re.sub(r"\.0(?=\s)", "", datetime)
            datetime_array = date_time.split()
            date = datetime_array[0].split("/")
            time_array = datetime_array[1].split(":")
            # Convert the 12-hour clock to a 24-hour clock.
            if datetime_array[2] == "PM" and time_array[0] != "12":
                hour = int(time_array[0]) + 12
            elif datetime_array[2] == "AM" and time_array[0] == "12":
                hour = int(time_array[0]) - 12
            else:
                hour = time_array[0]
            minutes = time_array[1]
            sec = round_to_n(int(time_array[2]), 1)
            # Carry the rounded seconds over into minutes, hours and the day.
            if sec == 60:
                sec = "00"
                minutes = int(minutes) + 1
            if minutes == 60:
                minutes = "00"
                hour = int(hour) + 1
            if hour == 24:
                hour = "00"
                date[1] = int(date[1]) + 1
            duration.append(to_sec(hour, minutes, sec))
            times.append(f"{hour}:{minutes}:{sec}")
            # Rewrite the stamp day-first (DD/MM/YYYY).
            date_time = f"{date[1]}/{date[0]}/{date[2]} {datetime_array[1]} {datetime_array[2]}"
            dat.loc[count, date_time_col] = date_time
            count += 1
        dat.insert(1, dur_col, duration, True)
        dat.insert(2, time_col, times, True)
        dat.drop(axis=1, columns=["#", date_time_col], inplace=True, errors="ignore")

    info_1S = {
        "non-columns": ["index", "NW1L, PSI (LGR S/N: 20705686)",
                        "NW2L, PSI (LGR S/N: 20705686)", "NW2S, PSI (LGR S/N: 20705686)"],
        "data-column": 'NW1S, PSI (LGR S/N: 20705686)',
        "well-index": '1S',
        "dataset": dataset_1S
    }
    info_1L = {
        "non-columns": ["index", "NW1S, PSI (LGR S/N: 20705686)",
                        "NW2L, PSI (LGR S/N: 20705686)", "NW2S, PSI (LGR S/N: 20705686)"],
        "data-column": 'NW1L, PSI (LGR S/N: 20705686)',
        "well-index": '1L',
        "dataset": dataset_1L
    }
    info_2S = {
        "non-columns": ["index", "NW1S, PSI (LGR S/N: 20705686)",
                        "NW1L, PSI (LGR S/N: 20705686)", "NW2L, PSI (LGR S/N: 20705686)"],
        "data-column": 'NW2S, PSI (LGR S/N: 20705686)',
        "well-index": '2S',
        "dataset": dataset_2S
    }
    info_2L = {
        "non-columns": ["index", "NW1S, PSI (LGR S/N: 20705686)",
                        "NW1L, PSI (LGR S/N: 20705686)", "NW2S, PSI (LGR S/N: 20705686)"],
        "data-column": 'NW2L, PSI (LGR S/N: 20705686)',
        "well-index": '2L',
        "dataset": dataset_2L
    }

    dataset = split_join(dataset, [info_1S, info_1L, info_2S, info_2L])
    dataset.drop(axis=1, columns=[id_col], inplace=True, errors="ignore")
    dataset.insert(0, id_col, [x for x in range(dataset.shape[0])], True)
    # reset_index left a "level_0" column behind (the frame already had an
    # "index" column), so drop it on the way out.
    return dataset.drop(axis=1, columns="level_0")


def get_all_data():
    # Get the list of data files and load each into a dataframe; collect the
    # dataframes in a list called files.
    alldata = pandas.read_csv("input/files.csv")
    files = []
    i = 0
    for filepath in alldata["File Path"]:
        i += 1
        try:
            files.append(pandas.read_csv(filepath))
        except UnicodeDecodeError:
            os.write(2, f"Failed {i} - {filepath}\n".encode("UTF-8", errors="replace"))
            raise
    print(f"{len(files)} of {alldata.shape[0]} loaded")

    # For each dataframe in files, standardize the column names and remove
    # columns with too many missing values.
    truth = 0
    total = 0
    cut_off = 0.4
    temp = []
    for file, name in zip(files, alldata["Name"]):
        for col in file.columns:
            total += 1
            result = column_matcher(col)
            if not result:
                file.drop(axis=1, columns=[col], inplace=True)
            else:
                file.rename(columns={col: result}, inplace=True)
                if file[result].isna().sum() / file.shape[0] > cut_off:
                    truth += 1
                    file.drop(axis=1, columns=[result], inplace=True)
        file.dropna(axis=0, how="any", inplace=True)
        temp.append(file)
    files = temp
    print(f"{truth}/{total} columns dropped due to insufficient data at {cut_off * 100:.0f}% cut-off")

    temp = dict()
    # Restructure the data (extract and correct time, remove unnecessary
    # columns and add new ones), remove null rows, drop the first row in case
    # of gibberish headers, and group the dataframes by day in a dict.
    for index, dat, name, path in zip(range(len(files)), files, alldata["Name"], alldata["File Path"]):
        try:
            count = 0
            duration = []
            times = []
            dates = []
            dat.dropna(axis=0, how="any", inplace=True)
            dat.drop(0, axis=0, inplace=True, errors="ignore")
            dat.reset_index(inplace=True)
            print(f"•{index + 1:^4}•", end=" ")
            dat = restructure(dat, count, duration, times, dates)
            dat.drop(axis=1, columns=date_time_col, inplace=True, errors="ignore")
            print(f"{str(sorted([x for x in dat.columns])):<123} •")
            # dat.to_csv(f"output/temp/{name}", index=False)
            if file_matcher(name):
                # Flowstation file: key on the date prefix of the file name.
                key = re.split(r"(?<=\d{2})(-|_|\s)(?=flow.*)", string=name.lower(), maxsplit=2)[0][:8]
                try_key(temp, key)
                temp[key][flow_key] = dat
            else:
                # Wellhead file: same date key, one list entry per well.
                key = re.split(r"(?<=\d{2})(\s-|-|_|\s)?(?=(t|\d[ls]).*)", string=name.lower(), maxsplit=2)[0][:8]
                try_key(temp, key)
                try:
                    temp[key][well_key]
                except KeyError:
                    temp[key][well_key] = []
                temp[key][well_key].append((file_matcher2(name.lower()), dat))
        except KeyError:
            print(f"\n\n{path}", flush=True)
            print("Columns:", dat.columns, end="\n\n", flush=True)
    return temp
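
# Shape sketch of get_all_data's return value, inferred from the code above
# (the file name "01-02-22-flowstation.csv" is hypothetical): the date prefix
# becomes the grouping key,
#
#     re.split(r"(?<=\d{2})(-|_|\s)(?=flow.*)", "01-02-22-flowstation.csv", maxsplit=2)[0][:8]
#     # -> "01-02-22"
#
# and each day's entry pairs the flowstation frame with its wellhead frames:
#
#     {"01-02-22": {flow_key: <flowstation frame>,
#                   well_key: [("1S", <frame>), ("2L", <frame>), ...]}}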

def get_conversion_factors():
    # Load conversion_factors.csv from input into a dataframe.
    return pandas.read_csv("input/conversion_factors.csv", encoding="UTF-8")


def offset_wells(agg_data, how=None):
    # Prepare aggregated data for each well from all days.
    if how is None:
        how = [0, 0, 0, 0]
    data_1S = []
    data_2S = []
    data_1L = []
    data_2L = []
    for key in agg_data.keys():
        try:
            # spj returns a list of (well id, data) tuples.
            temp = spj(agg_data[key][flow_key], agg_data[key][well_key], how)
            for t in temp:
                if t[0] == s1:
                    data_1S.append(t[1])
                elif t[0] == s2:
                    data_2S.append(t[1])
                elif t[0] == l1:
                    data_1L.append(t[1])
                elif t[0] == l2:
                    data_2L.append(t[1])
        except KeyError:
            pass
    data_1L = pandas.concat(data_1L, ignore_index=True)
    data_2L = pandas.concat(data_2L, ignore_index=True)
    data_1S = pandas.concat(data_1S, ignore_index=True)
    data_2S = pandas.concat(data_2S, ignore_index=True)
    data_s = []
    for name, data in zip([l1, l2, s1, s2], [data_1L, data_2L, data_1S, data_2S]):
        data.drop_duplicates(subset=[dur_col, date_col], inplace=True)
        data_s.append(data)
        # data.to_csv(f"output/{name}.csv", index=False)
    return pandas.concat(data_s, ignore_index=True)
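

# Hypothetical driver (not part of the original script), assuming the input/
# CSVs referenced above are present: load every file, align the wells with no
# offsets, and fetch the conversion-factor table.
if __name__ == "__main__":
    per_day = get_all_data()          # {date key: {flow_key: ..., well_key: [...]}}
    combined = offset_wells(per_day)  # one deduplicated frame across all four wells
    factors = get_conversion_factors()
    print(combined.shape, factors.shape)
    # combined.to_csv("output/combined.csv", index=False)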