Spaces:
Runtime error
Runtime error
import zipfile | |
import gzip | |
import shutil | |
import os | |
from tqdm import tqdm | |
from Bio.PDB import PDBParser, PPBuilder | |
def unzip(zipath, savefolder):
    """
    Extract every member of a zip archive into a folder.

    params:
        zipath: path to the .zip archive
        savefolder: directory the archive members are extracted into
    """
    # The context manager closes the archive even when extraction fails,
    # which the previous explicit close() call did not guarantee.
    with zipfile.ZipFile(zipath) as zf:
        zf.extractall(savefolder)
def ungzip(file, out_dir):
    """
    Decompress a gzip file into a directory.

    params:
        file: path to the gzip-compressed file
        out_dir: existing directory that receives the decompressed file
    The output file is named after the input file's base name with its
    last three characters (the ".gz" suffix) removed.
    """
    # os.path.basename handles the platform's path separators, unlike
    # splitting the path on '/' by hand.
    out_name = os.path.basename(file)[:-3]
    with gzip.open(file, 'rb') as f_in:
        with open(os.path.join(out_dir, out_name), 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
def get_seq_from_pdb(pdb_file, chain_id="A"):
    """
    Read the sequence of one chain from a PDB file.

    params:
        pdb_file: path to a PDB file; characters [-8:-4] of the path are
            used as the structure id handed to the parser
        chain_id: id of the chain to read from the first model
            (defaults to "A", the previously hard-coded chain)
    return:
        the sequences of all polypeptides built from the chain,
        concatenated in build order; an empty string when no polypeptide
        is built
    """
    parser = PDBParser()
    structure = parser.get_structure(pdb_file[-8:-4], pdb_file)
    # Only the first model (index 0) is inspected.
    # NOTE(review): a missing model or chain presumably raises KeyError
    # inside Biopython - confirm against the Bio.PDB documentation.
    chain = structure[0][chain_id]
    ppb = PPBuilder()
    # Accumulation starts from a plain str and uses `+=`, so the type of
    # the result follows whatever get_sequence() returns.
    seq = ""
    for pp in ppb.build_peptides(chain):
        seq += pp.get_sequence()
    return seq
def get_seqs_from_pdb(pdb_dir, out_file_path):
    """
    Write the sequence of every file in a directory to one FASTA-style file.

    params:
        pdb_dir: directory whose entries are each passed to get_seq_from_pdb
        out_file_path: path of the text file to (over)write
    Each entry produces a "> <file name>" header line followed by one
    sequence line. Entries are taken from os.listdir without filtering.
    """
    entries = os.listdir(pdb_dir)
    with open(out_file_path, "w") as fasta:
        # tqdm only wraps the iteration to show progress.
        for name in tqdm(entries):
            full_path = os.path.join(pdb_dir, name)
            sequence = get_seq_from_pdb(full_path)
            fasta.write(f"> {name}\n{sequence}\n")
def read_multi_fasta(file_path):
    """
    Parse a multi-record FASTA file into a dictionary.

    params:
        file_path: path to a fasta file
    return:
        a dictionary of sequences, keyed by the stripped header line
        (the leading '>' is kept) and holding the record's sequence with
        line breaks removed. Records without any sequence text are
        omitted; when a header occurs more than once the last record wins.
    raises:
        ValueError: if sequence text appears before the first header line
    """
    sequences = {}
    header = None
    # Collect the lines of the current record and join once per record
    # instead of growing a string line by line.
    parts = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines contribute nothing to a sequence.
                continue
            if line.startswith('>'):
                if parts:
                    sequences[header] = ''.join(parts)
                header = line
                parts = []
            elif header is None:
                # Previously this case surfaced later as an unbound
                # `header` variable; fail early with a clear message.
                raise ValueError(
                    f"sequence data found before the first header in {file_path}"
                )
            else:
                parts.append(line)
    # Flush the final record, which no following header terminates.
    if parts:
        sequences[header] = ''.join(parts)
    return sequences
def make_uid_chunks(uid_file, chunk_dir=None, chunk_size=10000):
    """
    Split a file of uniprot ids into fixed-size chunk files.

    params:
        uid_file: path to a file containing a list of uniprot ids,
            one per line
        chunk_dir: directory the chunk files are written to; defaults to
            a "chunks" directory next to uid_file. It is created when
            missing.
        chunk_size: size of each chunk
    return:
        None. Chunk files named "<name>_<i>.txt" are written, where
        <name> is the base name of uid_file up to its first "." and <i>
        counts from 0. Ids are newline-separated with no trailing newline.
        No file is written when uid_file contains no lines.
    raises:
        ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Read through a context manager so the input handle is closed.
    with open(uid_file, "r") as f:
        uids = [line.strip() for line in f]

    if chunk_dir is None:
        # os.path.join keeps the default relative when uid_file has no
        # directory part; plain concatenation produced "/chunks" there.
        chunk_dir = os.path.join(os.path.dirname(uid_file), "chunks")
    os.makedirs(chunk_dir, exist_ok=True)

    chunk_name = os.path.basename(uid_file).split(".")[0]
    # Stepping by chunk_size yields exactly ceil(len / chunk_size) chunks,
    # so no empty trailing file appears when the count divides evenly.
    for i, start in enumerate(range(0, len(uids), chunk_size)):
        with open(os.path.join(chunk_dir, f"{chunk_name}_{i}.txt"), "w") as f:
            f.write("\n".join(uids[start:start + chunk_size]))