# VenusFactory: src/crawler/metadata/download_interpro.py
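"""Download reviewed (UniProtKB/Swiss-Prot) protein lists for InterPro entries
from the EBI InterPro REST API. Each entry gets its own folder containing the
raw paginated results (detail.json), a summary (meta.json), and the protein
accessions (uids.txt).
"""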
import requests
import time
import json
import os
import argparse
from tqdm import tqdm


def fetch_info_data(url):
    """Follow the API's pagination ("next" links) and collect all results."""
    data_list = []
    while url:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        data_list.extend(data.get("results", []))
        url = data.get("next")
        if url:
            # Be polite to the EBI API between paginated requests
            time.sleep(10)
    return data_list


def download_single_interpro(interpro_id, out_dir):
    """Download all reviewed proteins matching one InterPro entry and write
    detail.json, meta.json, and uids.txt under out_dir/<interpro_id>."""
    interpro_dir = os.path.join(out_dir, interpro_id)
    os.makedirs(interpro_dir, exist_ok=True)
    start_url = f"https://www.ebi.ac.uk/interpro/api/protein/reviewed/entry/InterPro/{interpro_id}/?extra_fields=counters&page_size=20"
    file = os.path.join(interpro_dir, "detail.json")
    if os.path.exists(file):
        return f"Skipping {interpro_id}, already exists"
    try:
        info_data = fetch_info_data(start_url)
    except (requests.RequestException, ValueError) as e:
        return f"Error downloading {interpro_id}: {e}"
if not info_data:
return f"No data found for {interpro_id}"
with open(file, 'w') as f:
json.dump(info_data, f)
# Save metadata
meta_data = {
"metadata": {"accession": interpro_id},
"num_proteins": len(info_data)
}
with open(os.path.join(interpro_dir, "meta.json"), 'w') as f:
json.dump(meta_data, f)
# Save UIDs
uids = [d["metadata"]["accession"] for d in info_data]
with open(os.path.join(interpro_dir, "uids.txt"), 'w') as f:
f.write("\n".join(uids))
return f"Successfully downloaded {interpro_id}"


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Download reviewed-protein lists for InterPro entries")
    parser.add_argument("--interpro_id", type=str, default=None, help="single InterPro accession to download, e.g. IPR000001")
    parser.add_argument("--interpro_json", type=str, default=None, help="JSON file listing entries, each with metadata.accession")
    parser.add_argument("--out_dir", type=str, default="download/interpro_domain", help="output directory")
    parser.add_argument("--error_file", type=str, default=None, help="where to record failed downloads")
    parser.add_argument("--chunk_num", type=int, default=None, help="total number of chunks to split the entry list into")
    parser.add_argument("--chunk_id", type=int, default=None, help="0-based index of the chunk to process")
    args = parser.parse_args()
if not args.interpro_id and not args.interpro_json:
print("Error: Must provide either interpro_id or interpro_json")
exit(1)
os.makedirs(args.out_dir, exist_ok=True)
error_proteins = []
error_messages = []
if args.interpro_id:
result = download_single_interpro(args.interpro_id, args.out_dir)
print(result)
if "Error" in result or "No data" in result:
error_proteins.append(args.interpro_id)
error_messages.append(result)
    elif args.interpro_json:
try:
with open(args.interpro_json, 'r') as f:
all_data = json.load(f)
except FileNotFoundError:
print(f"Error: Could not find file {args.interpro_json}")
exit(1)
except json.JSONDecodeError:
print(f"Error: Invalid JSON file {args.interpro_json}")
exit(1)
        if args.chunk_num is not None and args.chunk_id is not None:
            # Process only the chunk_id-th slice of the entry list so that
            # several jobs can split the work between them
            start = args.chunk_id * len(all_data) // args.chunk_num
            end = (args.chunk_id + 1) * len(all_data) // args.chunk_num
            all_data = all_data[start:end]
for data in tqdm(all_data):
interpro_id = data["metadata"]["accession"]
result = download_single_interpro(interpro_id, args.out_dir)
if "Error" in result or "No data" in result:
error_proteins.append(interpro_id)
error_messages.append(result)
    if error_proteins and args.error_file:
        error_file_dir = os.path.dirname(args.error_file)
        # dirname is empty when error_file is a bare filename
        if error_file_dir:
            os.makedirs(error_file_dir, exist_ok=True)
        with open(args.error_file, 'w') as f:
            for protein, message in zip(error_proteins, error_messages):
                f.write(f"{protein} - {message}\n")
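
# Example invocations (illustrative; the accession and JSON path below are
# placeholders, not files shipped with this script):
#   python download_interpro.py --interpro_id IPR000001
#   python download_interpro.py --interpro_json entries.json \
#       --error_file download/interpro_domain/errors.txt --chunk_num 4 --chunk_id 0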