Spaces:
Runtime error
Runtime error
import requests | |
import time | |
import json | |
import os | |
import argparse | |
from tqdm import tqdm | |
def fetch_info_data(url, delay=10, timeout=120):
    """Collect every record from a paginated JSON endpoint.

    Each page is expected to be a JSON object with a ``results`` list and an
    optional ``next`` field holding the URL of the following page. Pages are
    fetched one after another until ``next`` is missing or empty.

    Args:
        url: URL of the first page. A falsy value returns an empty list
            without issuing any request.
        delay: Seconds to wait between two consecutive page requests, to
            avoid hammering the server.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            connection cannot block forever.

    Returns:
        A list with the concatenated ``results`` of all pages.

    Raises:
        requests.RequestException: On network failures, timeouts or a
            non-success HTTP status.
        KeyError: If a page has no ``results`` field.
        ValueError: If a response body is not valid JSON.
    """
    data_list = []
    while url:
        response = requests.get(url, timeout=timeout)
        # 204 No Content has an empty body; treat it as "nothing (more) to
        # fetch" instead of failing inside response.json().
        if response.status_code == 204:
            break
        # Surface HTTP errors explicitly rather than as a misleading
        # KeyError/JSON error further down.
        response.raise_for_status()
        data = response.json()
        data_list.extend(data["results"])
        url = data.get("next")
        # Only pause when another request will actually follow.
        if url:
            time.sleep(delay)
    return data_list
def download_single_interpro(interpro_id, out_dir):
    """Download the reviewed proteins of one InterPro entry into ``out_dir``.

    Creates ``<out_dir>/<interpro_id>/`` and, on success, writes three files
    into it:

    * ``detail.json`` - the full list of records returned by the API,
    * ``meta.json``   - the accession and the number of records,
    * ``uids.txt``    - one protein accession per line.

    The presence of ``detail.json`` marks the entry as done, so it is written
    last and moved into place atomically; an interrupted run therefore never
    leaves a marker that would cause an incomplete entry to be skipped.

    Args:
        interpro_id: InterPro accession used in the request URL and as the
            name of the output sub-directory.
        out_dir: Directory under which the per-entry directory is created.

    Returns:
        A human-readable status string beginning with ``"Skipping"``,
        ``"Error downloading"``, ``"No data found"`` or
        ``"Successfully downloaded"``.
    """
    interpro_dir = os.path.join(out_dir, interpro_id)
    os.makedirs(interpro_dir, exist_ok=True)

    detail_path = os.path.join(interpro_dir, "detail.json")
    if os.path.exists(detail_path):
        return f"Skipping {interpro_id}, already exists"

    start_url = f"https://www.ebi.ac.uk/interpro/api/protein/reviewed/entry/InterPro/{interpro_id}/?extra_fields=counters&page_size=20"
    try:
        info_data = fetch_info_data(start_url)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt/SystemExit
        # must still be able to stop a long batch run.
        return f"Error downloading {interpro_id}"

    if not info_data:
        return f"No data found for {interpro_id}"

    # Save metadata
    meta_data = {
        "metadata": {"accession": interpro_id},
        "num_proteins": len(info_data),
    }
    with open(os.path.join(interpro_dir, "meta.json"), 'w') as f:
        json.dump(meta_data, f)

    # Save UIDs
    uids = [d["metadata"]["accession"] for d in info_data]
    with open(os.path.join(interpro_dir, "uids.txt"), 'w') as f:
        f.write("\n".join(uids))

    # Save the records last: write to a temporary name, then rename, so the
    # "already exists" check above only ever sees a complete file.
    tmp_path = detail_path + ".tmp"
    with open(tmp_path, 'w') as f:
        json.dump(info_data, f)
    os.replace(tmp_path, detail_path)

    return f"Successfully downloaded {interpro_id}"
if __name__ == '__main__':
    # Command-line entry point: download either a single InterPro entry
    # (--interpro_id) or every entry listed in a JSON file (--interpro_json),
    # optionally restricted to one chunk of that list, and record failures.
    parser = argparse.ArgumentParser()
    parser.add_argument("--interpro_id", type=str, default=None,
                        help="single InterPro accession to download")
    parser.add_argument("--interpro_json", type=str, default=None,
                        help="JSON file with a list of entries, each holding "
                             "its accession under metadata.accession")
    parser.add_argument("--out_dir", type=str, default="download/interpro_domain",
                        help="directory that receives one sub-directory per entry")
    parser.add_argument("--error_file", type=str, default=None,
                        help="file to which failed accessions are written")
    parser.add_argument("--chunk_num", type=int, default=None,
                        help="number of chunks the entry list is split into")
    parser.add_argument("--chunk_id", type=int, default=None,
                        help="zero-based index of the chunk to process")
    args = parser.parse_args()

    if not args.interpro_id and not args.interpro_json:
        print("Error: Must provide either interpro_id or interpro_json")
        raise SystemExit(1)

    os.makedirs(args.out_dir, exist_ok=True)

    error_proteins = []
    error_messages = []

    if args.interpro_id:
        result = download_single_interpro(args.interpro_id, args.out_dir)
        print(result)
        if "Error" in result or "No data" in result:
            error_proteins.append(args.interpro_id)
            error_messages.append(result)
    elif args.interpro_json:
        try:
            with open(args.interpro_json, 'r') as f:
                all_data = json.load(f)
        except FileNotFoundError:
            print(f"Error: Could not find file {args.interpro_json}")
            raise SystemExit(1)
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON file {args.interpro_json}")
            raise SystemExit(1)

        if args.chunk_num is not None and args.chunk_id is not None:
            # Reject values that would divide by zero or select a
            # nonsensical slice.
            if args.chunk_num <= 0 or not 0 <= args.chunk_id < args.chunk_num:
                print("Error: chunk_num must be positive and chunk_id must "
                      "satisfy 0 <= chunk_id < chunk_num")
                raise SystemExit(1)
            # Integer arithmetic spreads the remainder so that the chunks
            # together cover the list exactly once.
            start = args.chunk_id * len(all_data) // args.chunk_num
            end = (args.chunk_id + 1) * len(all_data) // args.chunk_num
            all_data = all_data[start:end]

        for data in tqdm(all_data):
            interpro_id = data["metadata"]["accession"]
            result = download_single_interpro(interpro_id, args.out_dir)
            if "Error" in result or "No data" in result:
                error_proteins.append(interpro_id)
                error_messages.append(result)

    if error_proteins and args.error_file:
        error_file_dir = os.path.dirname(args.error_file)
        # dirname is "" for a bare filename, and os.makedirs("") raises.
        if error_file_dir:
            os.makedirs(error_file_dir, exist_ok=True)
        with open(args.error_file, 'w') as f:
            for protein, message in zip(error_proteins, error_messages):
                f.write(f"{protein} - {message}\n")