File size: 3,709 Bytes
ae5152f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 11:24:45 2020

@author: luol2
"""

import io
def _keep_highest_score(records, key_of):
    """Return one record per key, keeping the one with the highest score.

    Records are grouped by ``key_of(record)``. Within a group the record whose
    fifth field (index 4, parsed as float) is strictly greatest wins; on a tie
    the earliest record is kept. Output follows first-appearance order of keys.
    """
    best = {}
    for record in records:
        key = key_of(record)
        current = best.get(key)
        # Scores are only parsed when two records actually compete for a key.
        if current is None or float(record[4]) > float(current[4]):
            best[key] = record
    return list(best.values())


def nest_overlap_entity(nest_list):
    """Reduce a cluster of overlapping entity records to the best candidates.

    Each record is a sequence of strings indexed as
    ``[start, end, <field 2>, concept id, score]``. Two passes are applied:

    1. Among records sharing the same concept id (index 3), keep the one
       with the higher score.
    2. Among the survivors sharing the same ``start`` and ``end`` (indices 0
       and 1), keep the one with the higher score.

    The second pass compares the score of the surviving record itself. The
    earlier code read it from ``nest_list`` at the same index, which pointed
    at an unrelated record whenever the first pass had dropped anything.

    Args:
        nest_list: list of records (lists of strings) as described above.

    Returns:
        list: the retained records, in first-appearance order of their span.
    """
    # Pass 1: same concept id -> retain the higher-scoring mention.
    per_concept = _keep_highest_score(nest_list, lambda rec: rec[3])
    # Pass 2: same span, different ids -> retain the higher-scoring concept.
    return _keep_highest_score(per_concept, lambda rec: rec[0] + ' ' + rec[1])
def combine_ml_dict(dict_tsv,ml_tsv,nest=True):
    """Merge dictionary-based and ML-based tagging output, document by document.

    Both inputs are text in which documents are separated by an empty line.
    The first line of a document is its header; every following line is a
    tab-separated record whose first two fields are integer start and end
    offsets. Documents are paired by position.

    For each document the records from both sources are pooled (identical
    lines collapse into one), ordered by ``(start, end)``, and swept left to
    right. Records whose start does not lie beyond the furthest end seen so
    far form a cluster. A cluster with one record is emitted unchanged; a
    larger cluster is reduced with ``nest_overlap_entity``.

    Args:
        dict_tsv: dictionary tagger output as a single string.
        ml_tsv: ML tagger output as a single string; it is indexed with the
            document positions of ``dict_tsv``.
        nest: accepted for interface compatibility; not consulted here.

    Returns:
        str: for every document, the header line taken from ``ml_tsv``, the
        merged records one per line, and a terminating empty line.
    """
    def _emit_group(group, sink):
        # A lone record passes straight through; a real cluster is reduced.
        if len(group) == 1:
            sink.append(group[0])
        else:
            sink.extend(nest_overlap_entity(group))

    with io.StringIO(dict_tsv) as dict_stream:
        dict_docs = dict_stream.read().strip().split('\n\n')
    with io.StringIO(ml_tsv) as ml_stream:
        ml_docs = ml_stream.read().strip().split('\n\n')

    chunks = []
    for doc_no, dict_doc in enumerate(dict_docs):
        dict_rows = dict_doc.split('\n')
        ml_rows = ml_docs[doc_no].split('\n')

        # Pool both sources keyed by the raw line (row 0 is the header).
        spans = {}
        for row in dict_rows[1:] + ml_rows[1:]:
            fields = row.split('\t')
            spans[row] = (int(fields[0]), int(fields[1]))

        # Stable sort by (start, end); ties keep dictionary rows first.
        ordered = [row.split('\t') for row in sorted(spans, key=spans.get)]

        merged = []
        if ordered:
            cluster = [ordered[0]]
            right_edge = int(ordered[0][1])
            for fields in ordered[1:]:
                if int(fields[0]) > right_edge:
                    # Starts past everything seen so far: close the cluster.
                    _emit_group(cluster, merged)
                    cluster = [fields]
                else:
                    cluster.append(fields)
                right_edge = max(right_edge, int(fields[1]))
            _emit_group(cluster, merged)

        chunks.append(ml_rows[0] + '\n')
        chunks.extend('\t'.join(fields) + '\n' for fields in merged)
        chunks.append('\n')
    return ''.join(chunks)