#!/usr/bin/python
# coding: utf-8

# Author: LE YUAN
# Date: 2020-08-08

# This python script is to obtain protein sequence for each Kcat entries

import os
import re
import json
import requests
import time
from urllib import request
from zeep import Client
import hashlib
# import string
# import hashlib
# from SOAPpy import WSDL
# from SOAPpy import SOAPProxy ## for usage without WSDL file


# This function is to obtain the protein sequence according to the protein id from Uniprot API
# https://www.uniprot.org/uniprot/A0A1D8PIP5.fasta
# https://www.uniprot.org/help/api_idmapping
def uniprot_sequence(id) :
    url = "https://www.uniprot.org/uniprot/%s.fasta" % id
    IdSeq = dict()

    try :
        data = request.urlopen(url)
        respdata = data.read().decode("utf-8").strip()
        IdSeq[id] =  "".join(respdata.split("\n")[1:])
    except :
        print(id, "can not find from uniprot!")
        IdSeq[id] = None
    print(IdSeq[id])
    return IdSeq[id]
    
def uniprotID_entry() :
    # uniprot_sequence('P18314')
    with open("../../Data/database/Kcat_combination_0731.tsv", "r", encoding='utf-8') as file :
        combination_lines = file.readlines()[1:]

    uniprotID_list = list()
    uniprotID_seq = dict()
    uniprotID_noseq = list()

    i=0
    for line in combination_lines :
        data = line.strip().split('\t')
        uniprotID = data[5]

        if uniprotID :
        #     seq = uniprot_sequence('P49384')
            if ' ' in uniprotID :
                # i += 1  # 561
                # print(i)
                # print(uniprotID.split(' '))
                uniprotID_list += uniprotID.split(' ')
            else :
                # print(uniprotID)
                uniprotID_list.append(uniprotID)

    # print(len(uniprotID_list))    # 14045
    uniprotID_unique = list(set(uniprotID_list))
    # print(len(uniprotID_unique)) # 1776
    # print(uniprotID_unique[-6:])

    for uniprotID in uniprotID_unique :
        i += 1
        print(i)
        sequence = uniprot_sequence(uniprotID)
        if sequence :
            uniprotID_seq[uniprotID] = sequence
        else :
            uniprotID_noseq.append(uniprotID)


    print(len(uniprotID_seq))  # 1755
    print(len(uniprotID_noseq))  # 21
    print(uniprotID_noseq)
    # ['P0A5R0', 'P0C5C1', 'P51698', 'P96807', 'Q01745', 'P00892', 'D0B556', 'V5MWQ6', 'Q02469', 'P96223', 'P0A4Z2', 
    # 'P0A4X4', 'P96420', 'Q47741', 'O05783', 'A3S939', 'P0A4X6', 'P56967', 'O60344', 'P04804', 'O52310']
    # check one by one

    with open('../../Data/database/uniprotID_entry.json', 'w') as outfile :
        json.dump(uniprotID_seq, outfile, indent=4)

def uniprotID_noseq() :
    with open('../../Data/database/uniprotID_entry.json', 'r') as infile :
        uniprotID_seq = json.load(infile)

    print(len(uniprotID_seq))
    # uniprotID_noseq = ['P0A5R0', 'P0C5C1', 'P51698', 'P96807', 'Q01745', 'P00892', 'D0B556', 'V5MWQ6', 'Q02469', 'P96223', 'P0A4Z2', 
    # 'P0A4X4', 'P96420', 'Q47741', 'O05783', 'A3S939', 'P0A4X6', 'P56967', 'O60344', 'P04804', 'O52310']

    uniprotID_noseq = {'P0A5R0':'P9WIL4', 'P0C5C1':'P9WKD2', 'P51698':'A0A1L5BTC1', 'P96807':'P9WNP2', 'Q01745':'I1S2N3', 'P00892':'P0DP89', 
    'Q02469':'P0C278', 'P96223':'P9WNF8', 'P0A4Z2':'P9WPY2', 'P0A4X4':'P9WQ86', 'P96420':'P9WQB2', 'Q47741':'F2MMN9', 'O05783':'P9WIQ2', 
    'P0A4X6':'P9WQ80', 'P56967':'F2MMP0', 'O60344':'P0DPD6', 'P04804':'P60906', 'O52310':'P0CL72'}
    # 'D0B556', 'A3S939', 'V5MWQ6'  On April 1, 2015 this entry was made redundant.

    for uniprotID, mappedID in uniprotID_noseq.items() :
        sequence = uniprot_sequence(mappedID)
        print(uniprotID)
        print(sequence)
        if sequence :
            uniprotID_seq[uniprotID] = sequence
        else :
            print('No sequence found!---------------------------')

    print(len(uniprotID_seq))  # 1773   'D0B556', 'A3S939', 'V5MWQ6' no sequence found!

    with open('../../Data/database/uniprotID_entry_all.json', 'w') as outfile :
        json.dump(uniprotID_seq, outfile, indent=4)

# You can try to retrieve sequences from uniprot using rest interface.
# Example: (ec: 1.1.1.1 , organisms: Homo sapiens)
# http://www.uniprot.org/uniprot/?query=ec:1.1.1.1+AND+organism:"Homo sapiens"&format=fasta
# full information abut syntax you can find here: http://www.uniprot.org/help/programmatic_access
def seq_by_ec_organism(ec, organism) :
    IdSeq = dict()
    # https://www.biostars.org/p/356687/
    params = {"query": "ec:%s AND organism:%s AND reviewed:yes" % (ec, organism), "format": "fasta"}
    response = requests.get("http://www.uniprot.org/uniprot/", params=params)
    # print(type(response.text)) # <class 'str'>

    try :
        # respdata = response.text.strip()
        # # print(respdata) 
        # IdSeq[ec+'&'+organism] =  "".join(respdata.split("\n")[1:])

        respdata = response.text
        # print(respdata)
        sequence = list()
        seq = dict()
        i = 0
        for line in respdata.split('\n') :
            if line.startswith('>') :
                name=line
                seq[name] = ''
            else :
                seq[name] += line.replace('\n', '').strip()
        IdSeq[ec+'&'+organism] =  list(seq.values())

    except :
        print(ec+'&'+organism, "can not find from uniprot!")
        IdSeq[ec+'&'+organism] = None

    print(IdSeq[ec+'&'+organism])
    return IdSeq[ec+'&'+organism]

# Run in python 2.7
def seq_by_brenda(ec, organism) :
    # # E-mail in BRENDA:
    # email = 'leyu@chalmers.se'
    # # Password in BRENDA:
    # password = 'yuanle13579'

    # endpointURL = "https://www.brenda-enzymes.org/soap/brenda_server.php"
    # client      = SOAPProxy(endpointURL)
    # password    = hashlib.sha256(password).hexdigest()
    # credentials = email + ',' + password

    # parameters = credentials+","+"ecNumber*%s#organism*%s" %(ec, organism)
    # content = client.getSequence(parameters)

    # # E-mail in BRENDA:
    # email = 'leyu@chalmers.se'
    # # Password in BRENDA:
    # password = 'yuanle13579'

    # wsdl = "https://www.brenda-enzymes.org/soap/brenda.wsdl"
    # client      = WSDL.Proxy(wsdl)
    # password    = hashlib.sha256(password).hexdigest()
    # credentials = email + ',' + password

    # parameters = credentials+","+"ecNumber*%s#organism*%s" %(ec, organism)
    # content = client.getSequence(parameters)

    # split_sequences = content.strip().split('!') #noOfAminoAcids #!
    # # UniProtKB/TrEMBL is a computer-annotated protein sequence database complementing the UniProtKB/Swiss-Prot Protein Knowledgebase.
    # sequences = list()
    # # print(split_sequences)
    # for sequence in split_sequences :
    #     dict_entry = dict()
    #     # print(sequence)
    #     list_one = sequence.split('#')
    #     # print(list_one)
    #     for one in list_one[:-1] :
    #         # print(one)
    #         dict_entry[one.split('*')[0]] = one.split('*')[1]
    #     # try :
    #     #     if dict_entry['source'] == 'Swiss-Prot' :
    #     #         sequences.append(dict_entry['sequence'])
    #     #     else :
    #     #         continue
    #     # except :
    #     #     sequences = None
    #     try :
    #         sequences.append(dict_entry['sequence'])
    #     except :
    #         sequences = None

    # print(sequences)

    #New method using Python 3 because using Python 2 method provided by BRENDA could just run less than 10 hits as above
    # E-mail in BRENDA:
    email = 'youremail'
    # Password in BRENDA:
    password = 'yourpassword'

    wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
    password    = hashlib.sha256(password.encode("utf-8")).hexdigest()
    client = Client(wsdl)
    # credentials = email + ',' + password

    # parameters = credentials+","+"ecNumber*%s#organism*%s" %(ec, organism)
    parameters = ( email,password,"ecNumber*%s" % ec,"organism*%s" % organism, "sequence*", "noOfAminoAcids*", "firstAccessionCode*", "source*Swiss-Prot", "id*" ) # *Swiss-Prot
    entries = client.service.getSequence(*parameters)

    # print(entries)

    sequences = list()
    # print(split_sequences)
    if entries :
        for entry in entries :
            sequences.append(entry['sequence'])

    print(sequences)
    print(len(sequences))
    return sequences

def nouniprotID_entry_uniprot() :
    # ec = '1.1.1.206'
    # organism = 'Datura stramonium'
    # seq_by_ec_organism(ec, organism)

    with open("../../Data/database/Kcat_combination_0731.tsv", "r", encoding='utf-8') as file :
        combination_lines = file.readlines()[1:]

    IdSeq = dict()
    entries = list()
    i=0
    for line in combination_lines :
        data = line.strip().split('\t')
        ec = data[0]
        organism = data[2]
        uniprotID = data[5]

        if not uniprotID :
            entries.append((ec,organism))

    # print(len(entries))  # 28104
    entries_unique = set(entries)
    # print(len(entries_unique)) # 7258

    for entry in list(entries_unique) :
        # print(entry)
        ec, organism = entry[0], entry[1]
        i += 1
        print('This is', str(i)+'------------')
        IdSeq[ec+'&'+organism] = seq_by_ec_organism(ec, organism)
    # print(len(IdSeq)
        if i%10 == 0 :
            time.sleep(3)

    with open('../../Data/database/nouniprotID_entry_all.json', 'w') as outfile :
        json.dump(IdSeq, outfile, indent=4)

# Run in python 2.7
def nouniprotID_entry_brenda() :
    with open("../../Data/database/Kcat_combination_0731.tsv", "r") as file :
        combination_lines = file.readlines()[1:]

    IdSeq = dict()
    entries = list()
    i=0
    for line in combination_lines :
        data = line.strip().split('\t')
        ec = data[0]
        organism = data[2]
        uniprotID = data[5]

        if not uniprotID :
            entries.append((ec,organism))

    # print(len(entries))  # 28104
    entries_unique = set(entries)
    # print(len(entries_unique)) # 7258

    for entry in list(entries_unique) :
        # print(entry)
        ec, organism = entry[0], entry[1]
        i += 1
        print('This is', str(i)+'------------')
        # print(ec)
        # print(organism)
        IdSeq[ec+'&'+organism] = seq_by_brenda(ec,organism)

    with open('../../Data/database/nouniprotID_entry_brenda.json', 'w') as outfile :
        json.dump(IdSeq, outfile, indent=4)

def combine_sequence() :
    with open('../../Data/database/uniprotID_entry_all.json', 'r') as file1:
        uniprot_file1 = json.load(file1)

    with open('../../Data/database/nouniprotID_entry_all.json', 'r') as file2:  # By Uniprot API
        nouniprot_file2 = json.load(file2)

    with open('../../Data/database/nouniprotID_entry_brenda.json', 'r') as file3:  # By BRENDA API
        nouniprot_file3 = json.load(file3)

    with open("../../Data/database/Kcat_combination_0731.tsv", "r", encoding='utf-8') as file4 :
        Kcat_lines = file4.readlines()[1:]

    # i = 0
    # for proteinKey, sequence in nouniprot_file2.items() :
    #     if sequence :
    #         if len(sequence) == 1 :  # 1178 BRENDA  1919 Uniprot
    #         # if sequence :  # 1784 BRENDA  3363 Uniprot
    #             i += 1   
    #             print(i)
    # print(len(nouniprot_file3))

    i = 0
    j = 0
    n = 0
    entries = list()
    for line in Kcat_lines :
        data = line.strip().split('\t')
        ECNumber, EnzymeType, Organism, Smiles = data[0], data[1], data[2], data[3]
        Substrate, UniprotID, Value, Unit = data[4], data[5], data[6], data[7]

        RetrievedSeq = ''
        entry = dict()
        # print(UniprotID)
        if UniprotID :
            # print(UniprotID)
            try :  # because a few (maybe four) UniprotIDs have no ID as the key 
                if ' ' not in UniprotID :
                    RetrievedSeq = [uniprot_file1[UniprotID]]
                    # print(RetrievedSeq)
                else :
                    # print(UniprotID)
                    RetrievedSeq1 = [uniprot_file1[UniprotID.split(' ')[0]]]
                    RetrievedSeq2 = [uniprot_file1[UniprotID.split(' ')[1]]]
                    if RetrievedSeq1 == RetrievedSeq2 :
                        RetrievedSeq = RetrievedSeq1
                    # if len(RetrievedSeq) == 1:
                    #     print(RetrievedSeq)
            except :
                continue

        else :
            if nouniprot_file2[ECNumber+'&'+Organism] :
                # print(nouniprot_file2[ECNumber+'&'+Organism])
                if len(nouniprot_file2[ECNumber+'&'+Organism]) == 1 :
                    RetrievedSeq = nouniprot_file2[ECNumber+'&'+Organism]
                    # print(RetrievedSeq)
                else :
                    RetrievedSeq = ''

        # print(RetrievedSeq)
        try:  # local variable 'RetrievedSeq' referenced before assignment
            if len(RetrievedSeq) == 1 and EnzymeType == 'wildtype':  # 21108 for all, 9529 wildtype, 11579 mutant (EnzymeType != 'wildtype')
                sequence = RetrievedSeq
                i += 1
                # print(str(i) + '---------------------------')
                # print(ECNumber)
                # print(Organism)
                # print(sequence)

                entry = {
                    'ECNumber': ECNumber,
                    'Organism': Organism,
                    'Smiles': Smiles,
                    'Substrate': Substrate,
                    'Sequence': sequence[0],
                    'Type': 'wildtype',
                    'Value': Value,
                    'Unit': Unit,
                }

                entries.append(entry)

            if len(RetrievedSeq) == 1 and EnzymeType != 'wildtype':
                sequence = RetrievedSeq[0]

                mutantSites = EnzymeType.split('/')
                # print(mutantSites)

                mutant1_1 = [mutantSite[1:-1] for mutantSite in mutantSites]
                mutant1_2 = [mutantSite for mutantSite in mutantSites]
                mutant1 = [mutant1_1, mutant1_2]
                mutant2 = set(mutant1[0])
                if len(mutant1[0]) != len(mutant2) :
                    print(mutant1)
                    n += 1
                    print(str(n) + '---------------------------')  # some are mapped, some are not mapped. R234G/R234K (60, 43 mapped, 17 not mapped)

                mutatedSeq = sequence
                for mutantSite in mutantSites :
                    # print(mutantSite)
                    # print(mutatedSeq[int(mutantSite[1:-1])-1])
                    # print(mutantSite[0])
                    # print(mutantSite[-1])
                    if mutatedSeq[int(mutantSite[1:-1])-1] == mutantSite[0] :
                        # pass
                        mutatedSeq = list(mutatedSeq)
                        mutatedSeq[int(mutantSite[1:-1])-1] = mutantSite[-1]
                        mutatedSeq = ''.join(mutatedSeq)
                        if not mutatedSeq :
                            print('-------------')
                    else :
                        # n += 1
                        # print(str(n) + '---------------------------')
                        mutatedSeq = ''

                if mutatedSeq :
                    # j += 1
                    # print(str(j) + '---------------------------')          
                    entry = {
                        'ECNumber': ECNumber,
                        'Organism': Organism,
                        'Smiles': Smiles,
                        'Substrate': Substrate,
                        'Sequence': mutatedSeq,
                        'Type': 'mutant',
                        'Value': Value,
                        'Unit': Unit,
                    }

                    entries.append(entry)


        #     if len(RetrievedSeq) == 1 :  # 21108 for all, 9529 wildtype, 11579 mutant (EnzymeType != 'wildtype')
        #         sequence = RetrievedSeq
        #         # i += 1
        #         # print(str(i) + '---------------------------')
        #         # print(ECNumber)
        #         # print(Organism)
        #         # print(sequence)

        #         entry = {
        #             'ECNumber': ECNumber,
        #             'Organism': Organism,
        #             'Smiles': Smiles,
        #             'Substrate': Substrate,
        #             'Sequence': sequence[0],
        #             'Value': Value,
        #             'Unit': Unit,
        #         }

        #         entries.append(entry)

        except:
            continue

    # mutatedSeq.replace([int(mutantSite[1:-1])-1], mutantSite[-1])
    print(i)

    print(len(entries))   # 17010  including 9529 wildtype and 7481 mutant

    # with open('../../Data/database/Kcat_combination_0918.json', 'w') as outfile :
    #     json.dump(entries, outfile, indent=4)
    with open('../../Data/database/Kcat_combination_0918_wildtype_mutant.json', 'w') as outfile :
        json.dump(entries, outfile, indent=4)

def check_substrate_seq() :
    with open('../../Data/database/Kcat_combination_0918.json', 'r') as file :
        datasets = json.load(file)

    substrate = [data['Substrate'].lower() for data in datasets]
    sequence = [data['Sequence'] for data in datasets]
    organism = [data['Organism'].lower() for data in datasets]
    EC_number = [data['ECNumber'] for data in datasets]

    unique_substrate = len(set(substrate))
    unique_sequence = len(set(sequence))
    unique_organism = len(set(organism))
    unique_EC_number = len(set(EC_number))

    print('The number of unique substrate:', unique_substrate)
    print('The number of unique sequence:', unique_sequence)
    print('The number of unique organism:', unique_organism)
    print('The number of unique EC Number:', unique_EC_number)

    # The number of unique substrate: 2706
    # The number of unique sequence: 7857
    # The number of unique organism: 856
    # The number of unique EC Number: 1706

if __name__ == "__main__" :
    combine_sequence()
    check_substrate_seq()