import requests
import time
import json
import os
import argparse
from tqdm import tqdm
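
# Download per-entry protein lists from the EBI InterPro REST API. For each
# InterPro accession, this script saves the full paginated API response
# (detail.json), a small summary (meta.json), and the matched UniProt
# accessions, one per line (uids.txt), under <out_dir>/<interpro_id>/.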

def fetch_info_data(url):
    """Follow the paginated InterPro API and collect all result records."""
    data_list = []
    while url:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        data_list.extend(data["results"])
        # The API returns a "next" URL on every page except the last,
        # where it is null.
        url = data.get("next")
        # Throttle requests to stay polite toward the EBI servers.
        time.sleep(10)
    return data_list

def download_single_interpro(interpro_id, out_dir):
    """Download and cache all reviewed proteins for one InterPro entry."""
    interpro_dir = os.path.join(out_dir, interpro_id)
    os.makedirs(interpro_dir, exist_ok=True)

    # Paginated endpoint: reviewed (Swiss-Prot) proteins matching this
    # InterPro entry, 20 records per page, with extra counter fields.
    start_url = f"https://www.ebi.ac.uk/interpro/api/protein/reviewed/entry/InterPro/{interpro_id}/?extra_fields=counters&page_size=20"

    file = os.path.join(interpro_dir, "detail.json")
    if os.path.exists(file):
        return f"Skipping {interpro_id}, already exists"

    try:
        info_data = fetch_info_data(start_url)
    except Exception as e:
        return f"Error downloading {interpro_id}: {e}"
    
    if not info_data:
        return f"No data found for {interpro_id}"
        
    with open(file, 'w') as f:
        json.dump(info_data, f)
    
    # Save metadata
    meta_data = {
        "metadata": {"accession": interpro_id},
        "num_proteins": len(info_data)
    }
    with open(os.path.join(interpro_dir, "meta.json"), 'w') as f:
        json.dump(meta_data, f)
    
    # Save UIDs
    uids = [d["metadata"]["accession"] for d in info_data]
    with open(os.path.join(interpro_dir, "uids.txt"), 'w') as f:
        f.write("\n".join(uids))
    
    return f"Successfully downloaded {interpro_id}"

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--interpro_id", type=str, default=None)
    parser.add_argument("--interpro_json", type=str, default=None)
    parser.add_argument("--out_dir", type=str, default="download/interpro_domain")
    parser.add_argument("--error_file", type=str, default=None)
    parser.add_argument("--chunk_num", type=int, default=None)
    parser.add_argument("--chunk_id", type=int, default=None)
    args = parser.parse_args()
    
    if not args.interpro_id and not args.interpro_json:
        print("Error: Must provide either interpro_id or interpro_json")
        exit(1)
    
    os.makedirs(args.out_dir, exist_ok=True)
    error_proteins = []
    error_messages = []
    
    if args.interpro_id:
        result = download_single_interpro(args.interpro_id, args.out_dir)
        print(result)
        if "Error" in result or "No data" in result:
            error_proteins.append(args.interpro_id)
            error_messages.append(result)
    
    elif args.interpro_json:
        try:
            with open(args.interpro_json, 'r') as f:
                all_data = json.load(f)
        except FileNotFoundError:
            print(f"Error: Could not find file {args.interpro_json}")
            exit(1)
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON file {args.interpro_json}")
            exit(1)
            
        # Optional sharding: process the chunk_id-th of chunk_num contiguous
        # slices so that parallel jobs can split the entry list.
        if args.chunk_num is not None and args.chunk_id is not None:
            start = args.chunk_id * len(all_data) // args.chunk_num
            end = (args.chunk_id + 1) * len(all_data) // args.chunk_num
            all_data = all_data[start:end]
        
        for data in tqdm(all_data):
            interpro_id = data["metadata"]["accession"]
            result = download_single_interpro(interpro_id, args.out_dir)
            if "Error" in result or "No data" in result:
                error_proteins.append(interpro_id)
                error_messages.append(result)

    if error_proteins and args.error_file:
        error_file_dir = os.path.dirname(args.error_file)
        if error_file_dir:
            os.makedirs(error_file_dir, exist_ok=True)
        with open(args.error_file, 'w') as f:
            for protein, message in zip(error_proteins, error_messages):
                f.write(f"{protein} - {message}\n")