"""Scikit-learn compatible transformers for embedding sequences and selecting features."""

import pandas as pd
from bio_embeddings.embed.seqvec_embedder import SeqVecEmbedder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest
from sklearn.utils.validation import check_is_fitted


class Embedder(BaseEstimator, TransformerMixin):
    """Transformer that replaces each input row with its SeqVec embedding.

    The sequence to embed is read from the first column of every row.
    """

    def __init__(self) -> None:
        # The SeqVec embedder is created once and reused for every row.
        self.embedder = SeqVecEmbedder()
        super().__init__()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer has no fitted state."""
        return self

    def transform(self, X: pd.DataFrame):
        """Embed every row of ``X``.

        Args:
            X: Data frame whose first column holds the sequence to embed.

        Returns:
            The result of applying :meth:`embeddings` row by row, i.e. one
            embedding per input row.
        """
        return X.apply(self.embeddings, axis='columns')

    def embeddings(self, row: pd.Series) -> pd.Series:
        """Compute the per-protein embedding for a single row.

        Args:
            row: One row of the input frame; only its first value is used.

        Returns:
            The reduced (per-protein) embedding wrapped in a ``pd.Series``.
        """
        # Explicit positional access: ``row[0]`` relies on pandas' deprecated
        # positional fallback whenever the column labels are not integers.
        sequence = row.iloc[0]
        per_residue = self.embedder.embed(sequence)
        # presumably collapses the per-residue output into one fixed-length
        # vector per protein -- confirm against the bio_embeddings docs
        per_protein = self.embedder.reduce_per_protein(per_residue)
        return pd.Series(per_protein)


class FeaturesSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps the ``k`` best features using ``SelectKBest``.

    Args:
        k: Number of features to keep, forwarded unchanged to
            ``SelectKBest``. Defaults to 10, which is ``SelectKBest``'s own
            default.
    """

    def __init__(self, k=10) -> None:
        # Stored unmodified so that ``get_params``/``clone`` keep working.
        self.k = k
        super().__init__()

    def fit(self, X, y=None):
        """Fit the underlying ``SelectKBest`` on ``X`` and ``y``.

        Args:
            X: Feature matrix.
            y: Target values, passed through to ``SelectKBest.fit``.

        Returns:
            ``self``, to allow call chaining.
        """
        # Previously nothing ever assigned ``features_selector``, so
        # ``transform`` always failed with AttributeError after ``fit``.
        self.features_selector = SelectKBest(k=self.k).fit(X, y)
        return self

    def transform(self, X: pd.DataFrame):
        """Reduce ``X`` to the features chosen during :meth:`fit`.

        Args:
            X: Feature matrix with the same columns that were seen in ``fit``.

        Returns:
            Whatever the fitted selector's ``transform`` returns for ``X``.

        Raises:
            sklearn.exceptions.NotFittedError: If no selector is available
                yet, i.e. :meth:`fit` has not been called.
        """
        check_is_fitted(self, "features_selector")
        return self.features_selector.transform(X)