File size: 2,100 Bytes
70b95b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/python
# coding: utf-8

# Author: LE YUAN
# Date: 2020-07-24


import os
import re
import csv


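# NOTE: the commented-out block below is an earlier draft that only collected the
# normalized enzyme-type labels (without the other columns). It is superseded by
# the active code further down and is kept for reference only.
# with open("../../Data/database/Kcat_sabio_clean_unisubstrate.tsv", "r", encoding='utf-8') as file :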
#     lines = file.readlines()[1:]

# enzymeTypes = [line.strip().split('\t')[3] for line in lines]

# print(len(enzymeTypes)) # 18243

# enzymeType_entries = list()
# for desc in enzymeTypes :
#     if 'wildtype' in desc :
#         enzymeType = 'wildtype'
#     else :
#     # if 'mutant' in desc or 'mutated' in desc:
#         print(desc)
#         mutant = re.findall('[A-Z]\d+[A-Z]', desc)  # re is of great use
#         if len(mutant) >=1 :
#             enzymeType = '/'.join(mutant)

#     if enzymeType :
#         enzymeType_entries.append(enzymeType)

# # print(enzymeType_entries)
# print(len(enzymeType_entries))  



# Normalize the EnzymeType column of Kcat_sabio_clean_unisubstrate.tsv and write
# the rows that could be classified to Kcat_sabio_clean_unisubstrate_2.tsv.

INPUT_PATH = "../../Data/database/Kcat_sabio_clean_unisubstrate.tsv"
OUTPUT_PATH = "../../Data/database/Kcat_sabio_clean_unisubstrate_2.tsv"

# Names given to input columns 0-8 (by position); also written as the output header.
COLUMNS = ['Type', 'ECNumber', 'Substrate', 'EnzymeType', 'PubMedID', 'Organism', 'UniprotID', 'Value', 'Unit']
ENZYME_TYPE_INDEX = COLUMNS.index('EnzymeType')

# Uppercase letter, one or more digits, uppercase letter (e.g. "A123B").
# Raw string so that "\d" is a regex escape rather than an invalid string escape.
MUTATION_PATTERN = re.compile(r'[A-Z]\d+[A-Z]')


def normalize_enzyme_type(description):
    """Reduce a free-text enzyme-type description to a compact label.

    Args:
        description: Raw text of the EnzymeType column.

    Returns:
        'wildtype' if the description contains that substring; otherwise every
        MUTATION_PATTERN match joined with '/', in order of appearance. The
        result is '' when the description contains neither.
    """
    if 'wildtype' in description:
        return 'wildtype'
    return '/'.join(MUTATION_PATTERN.findall(description))


def split_row(line):
    """Split one TSV line into its fields, keeping empty trailing columns.

    Only the line terminator is removed before splitting: stripping all
    whitespace would also remove trailing tabs and silently drop empty columns
    at the end of the row. Outer whitespace of the first and last field is still
    trimmed, as a whole-line strip would do for a complete row.
    """
    fields = line.rstrip('\r\n').split('\t')
    fields[0] = fields[0].lstrip()
    fields[-1] = fields[-1].rstrip()
    return fields


def clean_records(lines, verbose=False, first_lineno=2):
    """Build the cleaned records from the data lines of the input table.

    Args:
        lines: Iterable of tab-separated data lines (header already removed).
        verbose: When true, print the raw description of every non-wildtype row
            and the normalized label of every row.
        first_lineno: Line number of the first element of ``lines`` in the
            source file; used only in error messages.

    Returns:
        A list of 9-element lists in COLUMNS order, with the EnzymeType field
        replaced by its normalized label. Rows whose label is empty are omitted.

    Raises:
        ValueError: If a non-blank row has fewer fields than COLUMNS.
    """
    cleaned = []
    for lineno, line in enumerate(lines, start=first_lineno):
        if not line.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            continue

        fields = split_row(line)
        if len(fields) < len(COLUMNS):
            raise ValueError(
                'line {}: expected at least {} tab-separated fields, got {}'.format(
                    lineno, len(COLUMNS), len(fields)))

        record = fields[:len(COLUMNS)]  # any extra columns are ignored
        description = record[ENZYME_TYPE_INDEX]
        label = normalize_enzyme_type(description)

        if verbose:
            if 'wildtype' not in description:
                print(description)
            print(label)

        if label:
            record[ENZYME_TYPE_INDEX] = label
            cleaned.append(record)
    return cleaned


def main():
    """Read INPUT_PATH, normalize its rows, and write them to OUTPUT_PATH."""
    with open(INPUT_PATH, "r", encoding='utf-8') as infile:
        next(infile, None)  # skip the header row; tolerate an empty file
        cleaned = clean_records(infile, verbose=True)

    print(len(cleaned))  # the original author noted 17384 for their dataset

    # Write with the same encoding the input was read with instead of the
    # platform default, so non-ASCII text round-trips on every system.
    with open(OUTPUT_PATH, "w", encoding='utf-8') as outfile:
        outfile.write('\t'.join(COLUMNS) + '\n')
        for record in cleaned:
            outfile.write('\t'.join(record) + '\n')


if __name__ == '__main__':
    main()