import zipfile
import gzip
import shutil
import os

from tqdm import tqdm
from Bio.PDB import PDBParser, PPBuilder


def unzip(zipath, savefolder):
    """Extract all members of a zip archive into savefolder."""
    with zipfile.ZipFile(zipath) as zf:
        zf.extractall(savefolder)


def ungzip(file, out_dir):
    """Decompress a .gz file into out_dir, dropping the .gz suffix."""
    out_path = os.path.join(out_dir, os.path.basename(file)[:-3])
    with gzip.open(file, 'rb') as f_in:
        with open(out_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
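
# Usage sketch (the archive and file names below are illustrative, not part of
# the original module):
#   unzip("structures.zip", "pdb_dir")
#   ungzip("pdb_dir/1abc.pdb.gz", "pdb_dir")   # writes pdb_dir/1abc.pdb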


def get_seq_from_pdb(pdb_file):
    """Return the chain-A sequence of a PDB file as a plain string.

    The structure id is read from the 4-character code in the file name
    (e.g. ".../1abc.pdb" -> "1abc"); only chain 'A' of the first model is used.
    """
    parser = PDBParser()
    structure = parser.get_structure(pdb_file[-8:-4], pdb_file)
    ppb = PPBuilder()
    chain = structure[0]['A']
    seq = ""
    for pp in ppb.build_peptides(chain):
        seq += str(pp.get_sequence())  # cast Seq to str so the return type is consistent
    return seq


def get_seqs_from_pdb(pdb_dir, out_file_path):
    """Write the chain-A sequence of every file in pdb_dir to a FASTA file."""
    pdbs = os.listdir(pdb_dir)
    with open(out_file_path, "w") as f:
        for pdb in tqdm(pdbs):
            seq = get_seq_from_pdb(os.path.join(pdb_dir, pdb))
            f.write(f">{pdb}\n{seq}\n")  # header is the file name (no space after '>')


def read_multi_fasta(file_path):
    """
    params:
        file_path: path to a fasta file
    return:
        a dictionary mapping each header line (including '>') to its sequence
    """
    sequences = {}
    header = None
    current_sequence = ''
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                # store the previous record before starting a new one;
                # sequence lines before the first header are ignored
                if header is not None and current_sequence:
                    sequences[header] = current_sequence
                current_sequence = ''
                header = line
            else:
                current_sequence += line
    if header is not None and current_sequence:
        sequences[header] = current_sequence
    return sequences
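
# Usage sketch (illustrative path): load the FASTA written above back into a
# dict keyed by header line.
#   seqs = read_multi_fasta("sequences.fasta")   # e.g. {'>1abc.pdb': 'MKT...'}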


def make_uid_chunks(uid_file, chunk_dir=None, chunk_size=10000):
    """
    params:
        uid_file: path to a file containing one uniprot id per line
        chunk_dir: output directory (defaults to <uid_file's directory>/chunks)
        chunk_size: number of ids per chunk
    writes:
        chunk files named <uid_file stem>_<i>.txt under chunk_dir
    """
    with open(uid_file, "r") as f:
        uids = [line.strip() for line in f]
    uid_path = os.path.dirname(uid_file)
    if chunk_dir is None:
        chunk_dir = os.path.join(uid_path, "chunks")
    os.makedirs(chunk_dir, exist_ok=True)
    # ceiling division so an exact multiple of chunk_size does not add an empty chunk
    chunk_num = (len(uids) + chunk_size - 1) // chunk_size
    chunk_name = os.path.splitext(os.path.basename(uid_file))[0]
    for i in range(chunk_num):
        with open(os.path.join(chunk_dir, f"{chunk_name}_{i}.txt"), "w") as f:
            f.write("\n".join(uids[i * chunk_size:(i + 1) * chunk_size]))