File size: 2,100 Bytes
70b95b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
#!/usr/bin/python
# coding: utf-8
# Author: LE YUAN
# Date: 2020-07-24
import os
import re
import csv
# with open("../../Data/database/Kcat_sabio_clean_unisubstrate.tsv", "r", encoding='utf-8') as file :
# lines = file.readlines()[1:]
# enzymeTypes = [line.strip().split('\t')[3] for line in lines]
# print(len(enzymeTypes)) # 18243
# enzymeType_entries = list()
# for desc in enzymeTypes :
# if 'wildtype' in desc :
# enzymeType = 'wildtype'
# else :
# # if 'mutant' in desc or 'mutated' in desc:
# print(desc)
# mutant = re.findall('[A-Z]\d+[A-Z]', desc) # re is of great use
# if len(mutant) >=1 :
# enzymeType = '/'.join(mutant)
# if enzymeType :
# enzymeType_entries.append(enzymeType)
# # print(enzymeType_entries)
# print(len(enzymeType_entries))
# Default locations of the table to clean and of the cleaned copy.
INPUT_PATH = "../../Data/database/Kcat_sabio_clean_unisubstrate.tsv"
OUTPUT_PATH = "../../Data/database/Kcat_sabio_clean_unisubstrate_2.tsv"

# Column names written as the header of the output table; the input rows are
# read positionally in this same order.
HEADER = ['Type', 'ECNumber', 'Substrate', 'EnzymeType', 'PubMedID', 'Organism', 'UniprotID', 'Value', 'Unit']

# A mutation code: one uppercase letter, one or more digits, one uppercase
# letter (e.g. "A123T"). Raw string so that "\d" is not an invalid escape.
MUTATION_PATTERN = re.compile(r'[A-Z]\d+[A-Z]')


def normalize_enzyme_type(description):
    """Reduce a free-text enzyme-type description to a compact label.

    Args:
        description: Text of the EnzymeType column of one row.

    Returns:
        'wildtype' if the description contains that word; otherwise every
        mutation code found in the description joined with '/'. The result
        is an empty string when no mutation code is found.
    """
    if 'wildtype' in description:
        return 'wildtype'
    return '/'.join(MUTATION_PATTERN.findall(description))


def clean_records(lines, *, verbose=True):
    """Parse tab-separated data rows and normalise their EnzymeType column.

    Args:
        lines: Iterable of data lines (header line already removed).
        verbose: When true, print the raw and the normalised enzyme type of
            every non-wildtype row, as the original script did.

    Returns:
        A list of 9-item lists in HEADER order. Rows whose normalised enzyme
        type is empty (neither wildtype nor any mutation code) are dropped.

    Raises:
        ValueError: If a non-blank row has fewer than 9 columns.
    """
    cleaned = []
    # Numbering starts at 2 because line 1 of the file is the header.
    for line_number, line in enumerate(lines, start=2):
        # Strip spaces and line endings only. Stripping tabs as well would
        # discard empty leading/trailing columns and shift or truncate the row.
        fields = line.strip(' \r\n').split('\t')
        if fields == ['']:
            continue  # blank line, e.g. at the end of the file
        if len(fields) < len(HEADER):
            raise ValueError(
                'line %d: expected %d tab-separated columns, got %d'
                % (line_number, len(HEADER), len(fields))
            )

        (record_type, ec_number, substrate, description, pubmed_id,
         organism, uniprot_id, value, unit) = fields[:len(HEADER)]

        enzyme_type = normalize_enzyme_type(description)
        if verbose and enzyme_type != 'wildtype':
            print(description)
            print(enzyme_type)

        if enzyme_type:
            cleaned.append([record_type, ec_number, substrate, enzyme_type,
                            pubmed_id, organism, uniprot_id, value, unit])
    return cleaned


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Read the input table, clean it, and write the result.

    Args:
        input_path: Tab-separated file whose first line is a header.
        output_path: Destination of the cleaned tab-separated file.
    """
    with open(input_path, "r", encoding='utf-8') as infile:
        lines = infile.readlines()[1:]  # skip the header line

    records = clean_records(lines)
    print(len(records))

    # Write with the same encoding used for reading, so non-ASCII text
    # round-trips regardless of the platform's default encoding.
    with open(output_path, "w", encoding='utf-8') as outfile:
        outfile.write('\t'.join(HEADER) + '\n')
        for record in records:
            outfile.write('\t'.join(record) + '\n')


if __name__ == "__main__":
    main()
|