# File size: 2,798 bytes (commit 4321e7e)
import argparse
import multiprocessing as mp
from functools import partial
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests as r
from Bio import SeqIO
def parse_args():
    """Parse command-line options for the UniProt sequence-fetching script.

    Returns:
        argparse.Namespace with file_path, sheet_name, uniprotid_column,
        uniprotids_column, and num_processes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--file_path",
        type=str,
        required=True,
        # Fixed typo/grammar in the original help text ("have a column cotaining").
        help="Path to the file that has a column containing UniProt id information.",
    )
    parser.add_argument(
        "--sheet_name",
        type=str,
        default="Sheet1",
        help="Name of the sheet to read. Default is Sheet1.",
    )
    parser.add_argument(
        "--uniprotid_column",
        type=str,
        help="Name of the column that contains a single UniProt id. Default is None.",
    )
    parser.add_argument(
        "--uniprotids_column",
        type=str,
        help=(
            "Name of the column that contains multiple UniProt ids. Default is None. "
            "The ids are expected to be separated by semi-colon, and the first id is used."
        ),
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=2,
        help="Number of processes to use.",
    )
    return parser.parse_args()
def fetch_sequence(row, cfg):
    """Download the amino-acid sequence for one row's UniProt id.

    Args:
        row: mapping-like (e.g. pandas Series) holding the id under
            cfg.uniprotid_column.
        cfg: argparse.Namespace providing uniprotid_column.

    Returns:
        The sequence as a plain string, or None on any failure (missing
        id, network error, empty FASTA) — callers treat this as best-effort.
    """
    try:
        uniprot_id = row[cfg.uniprotid_column]
        # GET (not POST) is the correct HTTP verb for fetching a resource;
        # HTTPS avoids a redirect round-trip, and the timeout keeps a
        # worker process from hanging forever on a stuck connection.
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
        response = r.get(url, timeout=30)
        response.raise_for_status()
        records = list(SeqIO.parse(StringIO(response.text), "fasta"))
        return str(records[0].seq)
    except Exception:
        # Deliberately broad: a failed lookup for one row is expected and
        # mapped to None so the rest of the batch keeps going.
        return None
def process_rows(df_chunk, cfg):
    """Fetch a sequence for every row of *df_chunk*.

    Returns a list (same order as the chunk's rows) of sequence strings,
    with None for rows whose lookup failed.
    """
    sequences = []
    for _, record in df_chunk.iterrows():
        sequences.append(fetch_sequence(record, cfg))
    return sequences
if __name__ == "__main__":
    config = parse_args()

    # Fail fast on bad arguments before touching the (possibly large) input file.
    if config.uniprotid_column is None and config.uniprotids_column is None:
        raise ValueError(
            "Either uniprotid_column or uniprotids_column should be provided."
        )

    # Both .xls and .xlsx are Excel files; the original `.endswith(".xls")`
    # check silently sent .xlsx files to read_csv.
    if config.file_path.endswith((".xls", ".xlsx")):
        df = pd.read_excel(config.file_path, sheet_name=config.sheet_name)
    else:
        df = pd.read_csv(config.file_path)

    if config.uniprotids_column is not None:
        df = df.dropna(subset=[config.uniprotids_column]).reset_index(drop=True)
        # Use the first id and ignore the subunit ("-N" suffix) and domain info.
        df["uniprotid"] = df[config.uniprotids_column].apply(
            lambda x: x.split(";")[0].split("-")[0]
        )
        config.uniprotid_column = "uniprotid"

    # One chunk per worker, fetched in parallel.
    df_split = np.array_split(df, config.num_processes)
    # NOTE: multiprocessing cannot pickle a lambda, so the original
    # `pool.map(lambda x: process_rows(x, config), ...)` raised a
    # PicklingError; functools.partial over the module-level function
    # is picklable and equivalent.
    with mp.Pool(processes=config.num_processes) as pool:
        results = pool.map(partial(process_rows, cfg=config), df_split)

    df["aa"] = [seq for chunk in results for seq in chunk]
    # Suffix-aware replacement keeps directories and dotted names intact
    # (the old `file_path.split('.')[0]` produced "" for "./data/file.csv").
    out_stem = Path(config.file_path).with_suffix("")
    df.to_csv(f"{out_stem}_with_aa.csv", index=False)
|