2dogey's picture
Upload folder using huggingface_hub
8918ac7 verified
import zipfile
import gzip
import shutil
import os
from tqdm import tqdm
from Bio.PDB import PDBParser, PPBuilder
def unzip(zipath, savefolder):
    """
    Extract every member of a zip archive into a folder.
    params:
        zipath: path to the .zip archive
        savefolder: directory the members are extracted into
    return:
        None
    """
    # The context manager closes the archive even if extraction fails midway.
    with zipfile.ZipFile(zipath) as zf:
        zf.extractall(savefolder)
def ungzip(file, out_dir):
    """
    Decompress a gzip file into out_dir.
    params:
        file: path to the gzip-compressed file
        out_dir: existing directory that receives the decompressed file
    return:
        None
    raises:
        ValueError: if the output path would be the input file itself
    """
    # basename() handles the platform's path separators, not only '/'.
    name = os.path.basename(file)
    # Only drop the suffix when it really is ".gz"; other names are kept whole
    # instead of losing their last three characters.
    if name.lower().endswith(".gz"):
        name = name[:-3]
    out_path = os.path.join(out_dir, name)
    # Opening the output for writing would truncate the input before it is read.
    if os.path.realpath(out_path) == os.path.realpath(file):
        raise ValueError(f"refusing to overwrite input file: {file}")
    with gzip.open(file, 'rb') as f_in, open(out_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
def get_seq_from_pdb(pdb_file, chain_id="A"):
    """
    Extract the amino-acid sequence of one chain from a PDB file.
    params:
        pdb_file: path to a PDB file
        chain_id: id of the chain to read from model 0 (default "A")
    return:
        the concatenated sequences of all peptides built for the chain;
        an empty string when no peptide is found
    """
    parser = PDBParser()
    # The structure id is only a label; it is taken from the four characters
    # before the file extension (assumes names like "1abc.pdb").
    structure = parser.get_structure(pdb_file[-8:-4], pdb_file)
    # NOTE(review): presumably raises KeyError when the chain is absent - confirm.
    chain = structure[0][chain_id]
    ppb = PPBuilder()
    seq = ""
    # A chain with gaps yields several peptides; they are joined in order.
    for pp in ppb.build_peptides(chain):
        seq += pp.get_sequence()
    return seq
def get_seqs_from_pdb(pdb_dir, out_file_path):
    """
    Write the sequence of every file in a directory to one FASTA-style file.
    params:
        pdb_dir: directory whose entries are each parsed as a PDB file
        out_file_path: path of the text file to write (overwritten)
    return:
        None
    """
    entries = os.listdir(pdb_dir)
    with open(out_file_path, "w") as fasta_out:
        # tqdm only adds a progress bar around the iteration.
        for entry in tqdm(entries):
            entry_path = os.path.join(pdb_dir, entry)
            sequence = get_seq_from_pdb(entry_path)
            # One record per file: a header line with the file name, then the sequence.
            fasta_out.write(f"> {entry}\n{sequence}\n")
def read_multi_fasta(file_path):
    """
    Read a multi-record FASTA file into a dictionary.
    params:
        file_path: path to a fasta file
    return:
        a dictionary mapping each header line (including the leading '>')
        to its sequence with line breaks removed; records with no sequence
        lines are omitted, and a repeated header keeps the last sequence
    raises:
        ValueError: if sequence text appears before the first header line
    """
    sequences = {}
    header = None
    # Collect lines and join once per record instead of repeated string +=.
    parts = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                if parts:
                    sequences[header] = ''.join(parts)
                    parts = []
                header = line
            elif line:
                if header is None:
                    # Without this check the record would have no key to be stored under.
                    raise ValueError(
                        f"sequence data before first header in {file_path}"
                    )
                parts.append(line)
    # Flush the final record, which has no following header to trigger the store.
    if parts:
        sequences[header] = ''.join(parts)
    return sequences
def make_uid_chunks(uid_file, chunk_dir=None, chunk_size=10000):
    """
    Split a list of uniprot ids into files of at most chunk_size ids each.
    params:
        uid_file: path to a file containing a list of uniprot ids, one per line
        chunk_dir: directory for the chunk files; defaults to a "chunks"
            folder next to uid_file
        chunk_size: size of each chunk
    return:
        None; writes files named "<uid file stem>_<index>.txt" into chunk_dir,
        each holding newline-separated ids (no file is written for empty input)
    raises:
        ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    # Close the input promptly and skip blank lines so no empty ids are emitted.
    with open(uid_file, "r") as f:
        uids = [uid for uid in (line.strip() for line in f) if uid]
    if chunk_dir is None:
        # os.path.join keeps a bare file name relative ("chunks"), whereas
        # string concatenation would point at the filesystem root ("/chunks").
        chunk_dir = os.path.join(os.path.dirname(uid_file), "chunks")
    os.makedirs(chunk_dir, exist_ok=True)
    # Everything before the first dot of the file name is used as the prefix.
    chunk_name = os.path.basename(uid_file).split(".")[0]
    # Stepping by chunk_size avoids a trailing empty chunk on exact multiples.
    for i, start in enumerate(range(0, len(uids), chunk_size)):
        with open(os.path.join(chunk_dir, f"{chunk_name}_{i}.txt"), "w") as f:
            f.write("\n".join(uids[start:start + chunk_size]))