# VenusFactory / src/crawler/structure/blastp_af_collect.py
# Uploaded by 2dogey via huggingface_hub (commit 8918ac7, verified)
import argparse
import os
import ssl
from urllib import request
from lxml import etree
from tqdm import tqdm
from fake_useragent import UserAgent
ua = UserAgent()
def process(args):
    """Collect AlphaFold accession IDs from saved BLAST result pages.

    Parses the local HTML file ``args.html`` for BLAST job IDs, fetches the
    accession list for each job from the EBI NCBI-BLAST REST API, removes
    duplicates, and writes the names to chunked text files under
    ``args.output`` named ``af_raw_<protein_name>_<i>.txt``.

    Args:
        args: Namespace with ``html`` (input HTML path), ``output``
            (output directory), ``protein_name`` (file-name tag), and
            ``chunk_size`` (max names per output file).
    """
    # Parse the saved results page leniently (recover=True tolerates the
    # malformed HTML that browser "save page" exports often produce).
    tree = etree.parse(args.html, parser=etree.HTMLParser(recover=True))
    # Extract the BLAST job identifiers linked from the results table.
    blast_items = tree.xpath('//*[@id="root"]/div/div/div/main/div[2]/div[2]/section/div/div/span[6]/a/text()')

    # NOTE(review): certificate verification is disabled on purpose here
    # (presumably to tolerate intercepting proxies) — this is insecure for
    # untrusted networks; confirm before reuse.
    context = ssl._create_unverified_context()
    base_url = "https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/result/"

    bar = tqdm(blast_items)
    names = []
    for item in bar:
        bar.set_postfix({"current": item})
        trg_url = base_url + item + "/accs"
        req = request.Request(trg_url,
                              headers={
                                  "Accept": "application/json",
                                  # Randomized UA to avoid being throttled/blocked.
                                  'user-agent': ua.random
                              })
        res = request.urlopen(req, context=context)
        # Each non-empty line carries a 5-char prefix before the accession
        # (assumed database tag, e.g. "SP:..." — TODO confirm against the
        # /accs payload format); strip it. The trailing "" after the final
        # newline is dropped by the [:-1] slice.
        payload = [p[5:] for p in res.read().decode().split("\n")[:-1]]
        names.extend(payload)

    # Deduplicate while preserving first-seen order so output files are
    # deterministic across runs (set() iteration order is not).
    names = list(dict.fromkeys(names))
    length = len(names)
    # Ceiling division: avoids writing a spurious empty chunk file when
    # length is an exact multiple of chunk_size (the old //+1 did that),
    # and writes no files at all when there are no names.
    max_i = -(-length // args.chunk_size)
    for i in range(max_i):
        names_ = names[i * args.chunk_size: (i + 1) * args.chunk_size]
        with open(os.path.join(args.output, f"af_raw_{args.protein_name}_{i}.txt"), "w") as f:
            for name in names_:
                f.write(name + "\n")
if __name__ == "__main__":
    # CLI entry point: defaults target the chorismate-mutase (CM) sample data.
    cli = argparse.ArgumentParser()
    cli.add_argument("--protein_name", required=False, default="CM", type=str)
    cli.add_argument("--html", required=False, default="data/CM/CM.html", type=str)
    cli.add_argument("--output", required=False, default="data/CM", type=str)
    cli.add_argument("--chunk_size", required=False, default=5000, type=int)
    process(cli.parse_args())