koke committed on
Commit
124bf1a
1 Parent(s): 7d8e80b
.gitattributes CHANGED
@@ -26,3 +26,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
26
  *.zip filter=lfs diff=lfs merge=lfs -text
27
  *.zstandard filter=lfs diff=lfs merge=lfs -text
28
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
26
  *.zip filter=lfs diff=lfs merge=lfs -text
27
  *.zstandard filter=lfs diff=lfs merge=lfs -text
28
  *tfevents* filter=lfs diff=lfs merge=lfs -text
29
+ *.json filter=lfs diff=lfs merge=lfs -text
30
+ *.hdf5 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ env/
2
+ .vscode/
3
+ __pycache__/
app.py CHANGED
@@ -1,4 +1,45 @@
1
  import streamlit as st
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from predict import predict_seq
3
+ from Bio import SeqIO
4
+ from io import StringIO
5
+ from src.utils import array2img
6
+ from src.fcgr import FCGR
7
 
8
# Local import: clean_seq is required below to sanitise the sequence before
# building the FCGR, and it is not part of the file-level imports.
from src.utils import clean_seq

with st.sidebar:
    st.write("Options")
    # k-mer size used only for the FCGR image shown to the user
    kmer = st.slider(
        label="kmer to visualize FCGR",
        min_value=6,
        max_value=9,
        value=8,
    )

# App
st.title('Sars-cov-2 classification with FCGR')

# load fasta file
uploaded_file = st.file_uploader(label="Load fasta file")

if uploaded_file is not None:
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    # Only the first record of the uploaded file is used. A default is passed
    # to next() so a file without any FASTA record does not raise StopIteration.
    fasta = next(SeqIO.parse(stringio, "fasta"), None)

    if fasta is None:
        st.error("No FASTA record found in the uploaded file.")
        st.stop()

    sequence = str(fasta.seq)

    with st.spinner("Inference..."):
        # predict_seq cleans the sequence itself; its FCGR is not needed here
        # because the image below is rebuilt with the k chosen in the sidebar.
        prediction, confidence = predict_seq(sequence)
    st.success("Done!")

    st.write("### Results ")
    st.write("Prediction: ", prediction)
    st.write("Confidence: ", confidence)

    # To generate the image to show
    with st.spinner("Plotting FCGR"):
        gen_fcgr = FCGR(kmer)
        # FCGR only knows k-mers over A, C, G, T (and skips those containing N),
        # so the sequence must be cleaned first, exactly as predict_seq does.
        fcgr = gen_fcgr(clean_seq(sequence))
        img = array2img(fcgr)

    # Show FCGR
    st.image(
        image=img,
        caption="FCGR",
        use_column_width="auto",
        width=20)
predict.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ from src.fcgr import FCGR
4
+ from src.model_loader import ModelLoader
5
+ from src.preprocessing import Pipeline
6
+ from src.utils import clean_seq
7
# Module-level setup: everything below runs once, at import time.

# FCGR generator for 6-mers; used by predict_seq to encode each sequence
fcgr = FCGR(k=6)
loader = ModelLoader()
# Class labels, indexed by the argmax of the model output in predict_seq.
# NOTE(review): presumably clade names in the order used at training time - confirm.
order_output = ['S','L','G','V','GR','GH','GV','GK','GRY','O','GRA']
# Build the "resnet50_6mers" architecture with 11 outputs (one per label above)
# and load the trained weights from the given file.
model = loader("resnet50_6mers", 11, "trained-models/model-34-0.954.hdf5")
# Preprocessing steps are read from JSON and applied to every FCGR before inference
with open("trained-models/preprocessing.json") as fp:
    pipe = json.load(fp)
preprocessing = Pipeline(pipe)
14
+
15
+
16
def predict_seq(seq, return_fcgr=False):
    """Classify a single DNA sequence.

    The sequence is cleaned, encoded as an FCGR with the module-level
    generator, passed through the module-level preprocessing pipeline and
    fed to the loaded model.

    Args:
        seq: DNA sequence as a string.
        return_fcgr: when True, also return the preprocessed FCGR array.

    Returns:
        ``(label, confidence)`` or ``(label, confidence, array)`` where
        ``label`` is the entry of ``order_output`` with the highest score and
        ``confidence`` is that score.
    """
    features = preprocessing(fcgr(clean_seq(seq)))

    # add a leading and a trailing axis of length 1 around the FCGR array
    batch = features[np.newaxis, ..., np.newaxis]
    scores = model.predict(batch)[0]

    best = scores.argmax()
    label, confidence = order_output[best], scores[best]

    if not return_fcgr:
        return label, confidence
    return label, confidence, features
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ tensorflow==2.7
3
+ biopython
4
+ numpy
5
+ Pillow
src/cgr.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "From original work: CGR for gene structure"
2
+ from typing import Dict, Optional
3
+ from collections import namedtuple
4
+
5
# coordinates for x+iy
Coord = namedtuple("Coord", ["x", "y"])

# coordinates for a CGR encoding: N is the number of nucleotides encoded so far
CGRCoords = namedtuple("CGRCoords", ["N", "x", "y"])

# coordinates for each nucleotide in the 2d-plane
DEFAULT_COORDS = dict(A=Coord(1, 1), C=Coord(-1, 1), G=Coord(-1, -1), T=Coord(1, -1))


class CGR:
    "Chaos Game Representation for DNA"

    def __init__(self, coords: Optional[Dict[str, tuple]] = None):
        """Create an encoder.

        Args:
            coords: mapping nucleotide -> corner coordinates. Entries must
                expose ``.x`` and ``.y`` (e.g. ``Coord``). Defaults to
                ``DEFAULT_COORDS``.
        """
        self.nucleotide_coords = DEFAULT_COORDS if coords is None else coords
        self.cgr_coords = CGRCoords(0, 0, 0)

    def nucleotide_by_coords(self, x, y):
        "Get nucleotide by coordinates (x,y)"
        target = Coord(x, y)
        matches = [
            nucleotide
            for nucleotide, coord in self.nucleotide_coords.items()
            if coord == target
        ]
        # IndexError when no nucleotide sits on (x, y), as before
        return matches[0]

    def forward(self, nucleotide: str):
        "Compute next CGR coordinates"
        # single lookup; an unknown nucleotide yields None and fails on `.x`
        corner = self.nucleotide_coords.get(nucleotide)
        x = (self.cgr_coords.x + corner.x) / 2
        y = (self.cgr_coords.y + corner.y) / 2

        # update cgr_coords
        self.cgr_coords = CGRCoords(self.cgr_coords.N + 1, x, y)

    def backward(self,):
        "Compute last CGR coordinates. Current nucleotide can be inferred from (x,y)"
        # get current nucleotide based on coordinates
        n_x, n_y = self.coords_current_nucleotide()
        nucleotide = self.nucleotide_by_coords(n_x, n_y)

        # update coordinates to the previous one (inverse of the midpoint step)
        x = 2 * self.cgr_coords.x - n_x
        y = 2 * self.cgr_coords.y - n_y

        # update cgr_coords
        self.cgr_coords = CGRCoords(self.cgr_coords.N - 1, x, y)

        return nucleotide

    def coords_current_nucleotide(self,):
        "Corner (+-1, +-1) selected by the sign of the current CGR coordinates"
        x = 1 if self.cgr_coords.x > 0 else -1
        y = 1 if self.cgr_coords.y > 0 else -1
        return x, y

    def encode(self, sequence: str):
        "From DNA sequence to CGR"
        # reset starting position to (0,0,0)
        self.reset_coords()
        for nucleotide in sequence:
            self.forward(nucleotide)
        return self.cgr_coords

    def reset_coords(self,):
        "Set the current position back to the origin with N=0"
        self.cgr_coords = CGRCoords(0, 0, 0)

    def decode(self, N: int, x: float, y: float) -> str:
        """From CGR to DNA sequence.

        Args:
            N: number of nucleotides that were encoded.
            x, y: final CGR coordinates, as returned by ``encode``.

        Returns:
            The sequence recovered by undoing ``N`` forward steps.
        """
        self.cgr_coords = CGRCoords(N, x, y)

        # decoded sequence (collected from last nucleotide to first)
        sequence = []

        # Recover the entire genome
        while self.cgr_coords.N > 0:
            nucleotide = self.backward()
            sequence.append(nucleotide)
        return "".join(sequence[::-1])
src/fcgr.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .cgr import CGR
2
+ from itertools import product
3
+ from collections import defaultdict
4
+ import numpy as np
5
+
6
class FCGR(CGR):
    """Frequency matrix CGR
    an (2**k x 2**k) 2D representation will be created for a
    n-long sequence.
    - k represents the k-mer.
    - 2**k x 2**k = 4**k the total number of k-mers (sequences of length k)
    - pixel value correspond to the value of the frequency for each k-mer
    """

    def __init__(self, k: int,):
        super().__init__()
        self.k = k  # k-mer representation
        # every k-mer over the alphabet A, C, G, T
        self.kmers = ["".join(kmer) for kmer in product("ACGT", repeat=self.k)]
        # k-mer -> (row, col) position, 1-based, computed once up front
        self.kmer2pixel = self.kmer2pixel_position()

    def __call__(self, sequence: str):
        """Given a DNA sequence, return its k-mer frequencies arranged as an FCGR.

        Returns:
            A (2**k, 2**k) numpy array of k-mer counts.

        Raises:
            KeyError: if the sequence contains a k-mer without "N" that is
                not made only of A, C, G, T (e.g. lowercase letters).
        """
        self.count_kmers(sequence)

        # Create an empty array to save the FCGR values
        array_size = 2**self.k
        freq_matrix = np.zeros((array_size, array_size))

        # Assign frequency to each box in the matrix (positions are 1-based)
        for kmer, freq in self.freq_kmer.items():
            pos_x, pos_y = self.kmer2pixel[kmer]
            freq_matrix[pos_x - 1, pos_y - 1] = freq
        return freq_matrix

    def count_kmer(self, kmer):
        "Add one occurrence of `kmer`, ignoring k-mers that contain N"
        if "N" not in kmer:
            self.freq_kmer[kmer] += 1

    def count_kmers(self, sequence: str):
        "Count every k-mer of `sequence` (sliding window, step 1) into self.freq_kmer"
        self.freq_kmer = defaultdict(int)
        # number of windows of length k in the sequence
        last_j = len(sequence) - self.k + 1
        # plain loop: count_kmer is called only for its side effect
        for i in range(last_j):
            self.count_kmer(sequence[i:i + self.k])

    def kmer_probabilities(self, sequence: str):
        "Relative frequency of each counted k-mer; count_kmers must have run before"
        self.probabilities = defaultdict(float)
        n_windows = len(sequence) - self.k + 1
        for key, value in self.freq_kmer.items():
            self.probabilities[key] = float(value) / n_windows

    def pixel_position(self, kmer: str):
        "Get pixel position in the FCGR matrix for a k-mer"

        coords = self.encode(kmer)
        x, y = coords.x, coords.y

        # Coordinates from [-1,1]² to [1,2**k]²
        np_coords = np.array([(x + 1) / 2, (y + 1) / 2])  # move coordinates from [-1,1]² to [0,1]²
        np_coords *= 2**self.k  # rescale coordinates from [0,1]² to [0,2**k]²
        x, y = np.ceil(np_coords)  # round to upper integer

        # Turn coordinates (cx,cy) into pixel (px,py) position
        # px = 2**k-cy+1, py = cx
        return 2**self.k - int(y) + 1, int(x)

    def kmer2pixel_position(self,):
        "Map every k-mer in self.kmers to its pixel position"
        return {kmer: self.pixel_position(kmer) for kmer in self.kmers}
src/model_loader.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Load model from /models"""
2
+ import importlib
3
+ import os
4
+
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from tensorflow.python.eager.context import num_gpus
9
+
10
# Name fragments: any entry of the models directory containing one of them is omitted
OMMIT = {".ipynb_checkpoints","__pycache__","__init__","custom_layers","custom_losses"}
BASE_DIR = Path(__file__).resolve().parent # directory that contains this module
BASE_MODELS = BASE_DIR.joinpath("models") # models directory
13
+
14
class ModelLoader:
    """Build Keras models defined as modules inside the models directory.

    Each model module is expected to expose a ``get_model(n_outputs)``
    function (it is looked up by that name in ``__call__``).
    """

    # Module names found in BASE_MODELS. Only ``*.py`` files are considered, so
    # directories and other files are no longer listed with their last three
    # characters chopped off.
    AVAILABLE_MODELS = [
        entry.stem
        for entry in BASE_MODELS.iterdir()
        if entry.suffix == ".py" and all(ommit not in entry.name for ommit in OMMIT)
    ]

    def __call__(self, model_name: str, n_outputs: int, weights_path: Optional[Path]=None):
        """Get keras model.

        Args:
            model_name: name of a module under ``src.models``.
            n_outputs: forwarded to the module's ``get_model``.
            weights_path: optional weights file loaded into the model.

        Returns:
            The model returned by ``get_model``, with weights loaded when
            ``weights_path`` is given.

        Raises:
            ModuleNotFoundError: if ``src.models.<model_name>`` does not exist.
            AttributeError: if that module has no ``get_model``.
        """
        # Resolve the model builder from its module
        module = importlib.import_module(f"src.models.{model_name}")
        get_model = getattr(module, "get_model")

        # Load architecture
        model = get_model(n_outputs)

        # Load weights to the model from file
        if weights_path is not None:
            print(f"\n **load model weights_path** : {weights_path}")
            model.load_weights(weights_path)

        print("\n**Model created**")

        return model
src/models/resnet50_6mers.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/c1ph3rr/Deep-Residual-Learning-for-Image-Recognition/blob/master/Resnet50.py
2
+ from pathlib import Path
3
+ from tensorflow.keras.models import Model
4
+ from tensorflow.keras.layers import (
5
+ Input,
6
+ Conv2D,
7
+ Dense,
8
+ MaxPool2D,
9
+ GlobalAveragePooling2D,
10
+ Add,
11
+ Activation,
12
+ BatchNormalization,
13
+ ZeroPadding2D,
14
+ )
15
+
16
+ # Reference name of model
17
+ MODEL_NAME = str(Path(__file__).resolve().stem)
18
+
19
def identity_block(inp, filters, kernel_size, block, layer):
    """Residual block whose shortcut is the unmodified input.

    Applies Conv(1x1) -> BN -> ReLU, Conv(kernel_size) -> BN -> ReLU and
    Conv(1x1) -> BN, adds ``inp`` to the result and applies a final ReLU.

    Args:
        inp: input tensor; it is added to the branch output as is.
        filters: three filter counts ``(f1, f2, f3)``, one per convolution.
        kernel_size: kernel size of the middle convolution.
        block: string used in the layer names (concatenated, so must be str).
        layer: string used in the layer names (concatenated, so must be str).

    Returns:
        Output tensor of the block.
    """
    # NOTE(review): layer names and creation order may matter when loading
    # saved weights - confirm before restructuring this function.

    f1, f2, f3 = filters

    conv_name = 'id_conv_b' + block + '_l' + layer
    batch_name = 'id_batch_b' + block + '_l' + layer

    x = Conv2D(filters=f1, kernel_size=1, padding='same', kernel_initializer='he_normal', name=conv_name + '_a')(inp)
    x = BatchNormalization(name=batch_name + '_a')(x)
    x = Activation('relu')(x)

    x = Conv2D(filters=f2, kernel_size=kernel_size, padding='same', kernel_initializer='he_normal', name=conv_name + '_b')(x)
    x = BatchNormalization(name=batch_name + '_b')(x)
    x = Activation('relu')(x)

    x = Conv2D(filters=f3, kernel_size=1, padding='same', kernel_initializer='he_normal', name=conv_name + '_c')(x)
    x = BatchNormalization(name=batch_name + '_c')(x)

    # residual connection: input + branch output
    add = Add()([inp, x])
    x = Activation('relu')(add)

    return x
41
+
42
+
43
def convolutional_block(inp, filters, kernel_size, block, layer, strides=2):
    """Residual block whose shortcut is a strided 1x1 convolution.

    Main branch: Conv(1x1, strides) -> BN -> ReLU, Conv(kernel_size) -> BN ->
    ReLU, Conv(1x1) -> BN. Shortcut branch: Conv(1x1, strides) -> BN applied
    to ``inp``. Both branches are added and passed through ReLU.

    Args:
        inp: input tensor.
        filters: three filter counts ``(f1, f2, f3)``; the shortcut uses ``f3``.
        kernel_size: kernel size of the middle convolution.
        block: string used in the layer names (concatenated, so must be str).
        layer: string used in the layer names (concatenated, so must be str).
        strides: strides of the first convolution and of the shortcut.

    Returns:
        Output tensor of the block.
    """
    # NOTE(review): layer names and creation order may matter when loading
    # saved weights - confirm before restructuring this function.

    f1, f2, f3 = filters

    conv_name = 'res_conv_b' + block + '_l' + layer
    batch_name = 'res_batch_b' + block + '_l' + layer

    y = Conv2D(filters=f1, kernel_size=1, padding='same', strides=strides, kernel_initializer='he_normal', name=conv_name + '_a')(inp)
    y = BatchNormalization(name=batch_name + '_a')(y)
    y = Activation('relu')(y)

    y = Conv2D(filters=f2, kernel_size=kernel_size, padding='same', kernel_initializer='he_normal', name=conv_name + '_b')(y)
    y = BatchNormalization(name=batch_name + '_b')(y)
    y = Activation('relu')(y)

    y = Conv2D(filters=f3, kernel_size=1, padding='same', kernel_initializer='he_normal', name=conv_name + '_c')(y)
    y = BatchNormalization(name=batch_name + '_c')(y)

    # projection shortcut built from the block input
    shortcut = Conv2D(filters=f3, kernel_size=1, strides=strides, kernel_initializer='he_normal', name=conv_name + '_shortcut')(inp)
    shortcut = BatchNormalization(name=batch_name + '_shortcut')(shortcut)

    add = Add()([shortcut, y])
    y = Activation('relu')(add)

    return y
68
+
69
def get_model(n_outputs):
    """Build the ResNet50-style classifier.

    The network takes a single-channel 64x64 input, applies a 7x7 stem
    convolution with max pooling, then four stages made of one convolutional
    block followed by 2, 3, 5 and 2 identity blocks respectively, global
    average pooling and a softmax dense layer.

    Args:
        n_outputs: number of units of the final softmax layer.

    Returns:
        A ``Model`` mapping the input to ``n_outputs`` softmax scores.
    """
    # NOTE(review): 64 = 2**6, presumably the side of a 6-mer FCGR - confirm.

    inp = Input(shape=(64, 64, 1), name='input')
    padd = ZeroPadding2D(3)(inp)

    # stem
    conv1 = Conv2D(64, 7, strides=2, padding='valid', name='conv1')(padd)
    conv1 = BatchNormalization(name='batch2')(conv1)
    conv1 = Activation('relu')(conv1)
    conv1 = ZeroPadding2D(1)(conv1)
    conv1 = MaxPool2D(3, 2)(conv1)

    # stage 2: strides=1, the only stage whose convolutional block does not downsample
    conv2 = convolutional_block(conv1, [64,64,256], 3, '2', '1', strides=1)
    conv2 = identity_block(conv2, [64,64,256], 3, '2', '2')
    conv2 = identity_block(conv2, [64,64,256], 3, '2', '3')

    # stage 3
    conv3 = convolutional_block(conv2, [128,128,512], 3, '3', '1')
    conv3 = identity_block(conv3, [128,128,512], 3, '3', '2')
    conv3 = identity_block(conv3, [128,128,512], 3, '3', '3')
    conv3 = identity_block(conv3, [128,128,512], 3, '3', '4')

    # stage 4
    conv4 = convolutional_block(conv3, [256,256,1024], 3, '4', '1')
    conv4 = identity_block(conv4, [256,256,1024], 3, '4', '2')
    conv4 = identity_block(conv4, [256,256,1024], 3, '4', '3')
    conv4 = identity_block(conv4, [256,256,1024], 3, '4', '4')
    conv4 = identity_block(conv4, [256,256,1024], 3, '4', '5')
    conv4 = identity_block(conv4, [256,256,1024], 3, '4', '6')

    # stage 5
    conv5 = convolutional_block(conv4, [512,512,2048], 3, '5', '1')
    conv5 = identity_block(conv5, [512,512,2048], 3, '5', '2')
    conv5 = identity_block(conv5, [512,512,2048], 3, '5', '3')

    # classification head
    avg_pool = GlobalAveragePooling2D()(conv5)
    out = Dense(n_outputs, activation='softmax')(avg_pool)

    return Model(inp, out)
src/pipeline.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ from pathlib import Path
4
+ from collections import OrderedDict
5
+ from typing import List, Tuple, Optional, Union
6
+
7
# Registry of the functions usable in a Pipeline, keyed by function name
FUNCTIONS_PIPELINE = OrderedDict()


def register_in_pipeline(func):
    """Decorator that makes ``func`` available to the pipeline by name.

    Args:
        func: function to register under ``func.__name__``.

    Returns:
        ``func`` itself, so the decorated name stays bound to the function
        (previously the decorator returned None and the name became None).

    Raises:
        Exception: if a function with the same name is already registered.
    """
    print(f"Adding {func.__name__}")
    if func.__name__ in FUNCTIONS_PIPELINE:
        raise Exception(f"Duplicated function with name {func.__name__}")
    FUNCTIONS_PIPELINE[func.__name__] = func
    return func
16
+
17
class Pipeline:
    """Ordered list of registered functions applied one after another to an input.

    Each step is ``(func_name, kwargs)`` or ``(func_name, args, kwargs)``; the
    last element must be a dict and ``func_name`` must be a key of
    ``FUNCTIONS_PIPELINE``.
    """
    FUNCTIONS_PIPELINE = FUNCTIONS_PIPELINE

    def __init__(self, pipeline: Optional[List[Tuple[str, dict]]] = None):
        self.pipeline = pipeline or []

    def __call__(self, x):
        """Run every step of the pipeline on 'x' and return the final value"""
        for step in self.pipeline:
            func_name, *args, kwargs = step
            assert isinstance(kwargs, dict), f"Wrong declaration in {func_name!r}. Must be (str, dict) or (str, tuple, dict)"
            # unpacking an empty `args` is a no-op, so one call covers both step shapes
            x = self.apply(x, func_name, *args, **kwargs)
        return x

    @classmethod
    def apply(cls, x, func, *args, **kwargs):
        """Call the registered function named `func` as func(x, *args, **kwargs)"""
        if func not in cls.FUNCTIONS_PIPELINE:
            raise TypeError(f"{func} not available")
        return cls.FUNCTIONS_PIPELINE[func](x, *args, **kwargs)

    def __gt__(self, add_pipe: Union[List,Tuple]):
        """Append a step ("func_name", args, kwargs) or ("func_name", kwargs) and return self"""
        if not self.is_available(add_pipe[0]):
            raise NotImplementedError(f"{add_pipe[0]!r} not available in Pipeline")
        self.pipeline.append(add_pipe)
        return self

    def is_available(self, func_name: str):
        """Tell whether 'func_name' is registered in Pipeline"""
        return func_name in self.FUNCTIONS_PIPELINE

    def asJSON(self, path_save: str =None):
        """Write the pipeline configuration to a json file (default: pipeline.json)"""
        path_save = Path(path_save) if path_save else Path("pipeline.json")
        with open(path_save, "w", encoding="utf8") as fp:
            json.dump(self.pipeline, fp, indent=4, ensure_ascii=False)
        print(f"Pipeline configuration saved at {path_save!r}")

    def fromJSON(self, path_pipeline: str):
        """Read a pipeline configuration from a json file.

        When every function is registered the configuration replaces
        ``self.pipeline`` and nothing is returned. Otherwise the pipeline is
        left untouched and the names of the missing functions are returned.
        """
        path_pipeline = Path(path_pipeline)
        with open(path_pipeline, "r", encoding="utf8") as fp:
            pipeline = json.load(fp)

        # Check that every function named in the file is registered
        available_functions = {}
        for step in pipeline:
            available_functions[step[0]] = self.is_available(step[0])

        # TODO: change with the right Exception here
        if not all(available_functions.values()):
            print("""
            Some functions are not availables.
            Please use the @register_in_pipeline decorator to include this functions to the Pipeline.
            """)
            # names with a falsy key are dropped, then only the unavailable ones are kept
            return [func_name for func_name, available in available_functions.items()
                    if func_name and not available]

        self.pipeline = pipeline
        print(f"Pipeline loaded from {path_pipeline!r}")
src/preprocessing.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """All functions that can be applied as preprocessing"""
2
+ from .pipeline import (
3
+ register_in_pipeline, # decorator to make available a function to use with Pipeline class
4
+ Pipeline,
5
+ )
6
+
7
@register_in_pipeline
def divide_by_max(npy,):
    """Return `npy` divided by its maximum value."""
    peak = npy.max()
    return npy / peak
11
+
12
@register_in_pipeline
def divide_by_sum(npy,):
    """Return `npy` divided by the sum of its values."""
    total = npy.sum()
    return npy / total
src/utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import numpy as np
3
+
4
+
5
def clean_seq(seq):
    """Upper-case `seq` and replace every character other than A, C, G, T or N by "N".

    Unlike a fixed list of letters, this also covers gaps, digits, whitespace
    and any other symbol, so the result contains only the five characters the
    k-mer counting understands. The length of the sequence is preserved.

    Args:
        seq: DNA sequence as a string.

    Returns:
        The cleaned sequence.
    """
    valid = frozenset("ACGTN")
    return "".join(base if base in valid else "N" for base in seq.upper())
11
+
12
def array2img(array):
    """FCGR array to grayscale image.

    The array is rescaled linearly to [0, 1] and inverted, so its largest
    value becomes black (0) and its smallest white (255).

    Args:
        array: numeric numpy array (the FCGR matrix).

    Returns:
        A PIL image in mode 'L'.
    """
    max_color = 255
    m, M = array.min(), array.max()

    # rescale to [0,1]; a constant array would give 0/0, so map it to all zeros
    if M > m:
        img_rescaled = (array - m) / (M - m)
    else:
        img_rescaled = np.zeros(array.shape, dtype=float)

    # invert colors black->white
    img_array = np.ceil(max_color - img_rescaled * max_color)
    # pixel values span 0..255: they need an unsigned 8-bit type (int8 stops at 127)
    img_array = img_array.astype(np.uint8)

    # convert to Image
    img_pil = Image.fromarray(img_array, 'L')
    return img_pil
trained-models/model-34-0.954.hdf5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:168a6bb54b1f3fca3febb5ee62c7ecfa5db762ae9812da180f3c383e98cfc18b
3
+ size 283851536
trained-models/preprocessing.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f991c810113366959d350e621ce3b618719326d3c9e638e5f45b47023329adf6
3
+ size 51