File size: 8,105 Bytes
93e1b64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# %%
import rdflib
import pandas as pd


def get_graph():
    """Build an rdflib graph from ``MGCONSO.RRF`` and ``MGREL.RRF``.

    Both files are read from the current working directory as pipe-separated
    tables whose first line is the header. Rows whose ``SUPPRESS`` column
    equals ``"Y"`` are ignored in both tables.

    * Concepts: for every row with ``ISPREF == "Y"``, ``STT == "PF"`` and
      ``TS == "P"``, the triple ``(medgen:<CUI>, rdfs:label, STR)`` is added.
    * Relations: for every row, the triple
      ``(medgen:<CUI1>, <predicate>, medgen:<CUI2>)`` is added, where the
      predicate is the relative URI ``related`` when ``REL == "RL"`` and
      ``medgen:<REL>`` otherwise.

    The head of each table is printed as a quick sanity check.

    Returns:
        rdflib.Graph: the populated graph, with the ``medgen`` prefix bound
        to ``http://identifiers.org/medgen/``.
    """
    # File with the concepts: MGCONSO.RRF
    df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0)
    # Rename the column '#CUI' to 'CUI'
    df_concepts = df_concepts.rename(columns={"#CUI": "CUI"})
    # Remove the last column, it's empty
    df_concepts = df_concepts.iloc[:, :-1]
    print(df_concepts.head())

    # Create a graph and bind the namespace
    g = rdflib.Graph()
    g.bind("medgen", "http://identifiers.org/medgen/")

    # Select the rows of interest with one vectorized mask instead of testing
    # every row inside a (slow) DataFrame.iterrows() loop.
    preferred = df_concepts[
        (df_concepts["SUPPRESS"] != "Y")
        & (df_concepts["ISPREF"] == "Y")
        & (df_concepts["STT"] == "PF")
        & (df_concepts["TS"] == "P")
    ]
    for cui, label in zip(preferred["CUI"], preferred["STR"]):
        uri = rdflib.URIRef(f"http://identifiers.org/medgen/{cui}")
        g.add((uri, rdflib.RDFS.label, rdflib.Literal(label)))

    # Now, load MGREL.RRF
    df_relations = pd.read_csv("MGREL.RRF", sep="|", header=0)
    # Rename the column '#CUI1' to 'CUI1'
    df_relations = df_relations.rename(columns={"#CUI1": "CUI1"})
    # Remove the last column, it's empty
    df_relations = df_relations.iloc[:, :-1]
    print(df_relations.head())

    # The "related" predicate is a constant: build it once, outside the loop.
    related = rdflib.URIRef("related")
    active = df_relations[df_relations["SUPPRESS"] != "Y"]
    for cui1, rel, cui2 in zip(active["CUI1"], active["REL"], active["CUI2"]):
        uri1 = rdflib.URIRef(f"http://identifiers.org/medgen/{cui1}")
        uri2 = rdflib.URIRef(f"http://identifiers.org/medgen/{cui2}")
        if rel == "RL":
            predicate = related
        else:
            predicate = rdflib.URIRef(f"http://identifiers.org/medgen/{rel}")
        g.add((uri1, predicate, uri2))

    return g

def apply_rules_to_graph(g):
    """Add symmetric ``related`` edges between nodes sharing an ``RN`` subject.

    For every pair of distinct nodes ``child1`` / ``child2`` for which both
    ``?parent medgen:RN ?child1`` and ``?parent medgen:RN ?child2`` hold, the
    triples ``(child1, related, child2)`` and ``(child2, related, child1)``
    are added to the graph.

    NOTE(review): the comment this docstring replaces described the rule in
    terms of ``RB`` ("node1 RB node2 and node3 RB node2"), while the query
    matches ``RN`` with the shared node as subject. The query is kept as is;
    confirm which direction is intended.

    Args:
        g: graph to query and extend in place; must provide ``query`` and
            ``add`` (presumably an ``rdflib.Graph``).

    Returns:
        The same graph object ``g``, after the new triples have been added.
    """
    query = """
    PREFIX medgen: <http://identifiers.org/medgen/>
    SELECT DISTINCT ?parent ?child1 ?child2 WHERE {
        ?parent medgen:RN ?child1 .
        ?parent medgen:RN ?child2 .
        FILTER (?child1 != ?child2)
    }
    """
    # Materialize the result before touching the graph: the query result may
    # be evaluated lazily, and adding triples to the graph it is still reading
    # from is unsafe.
    sibling_pairs = list(g.query(query))
    # The predicate is constant, so build it once.
    related = rdflib.URIRef("related")
    for row in sibling_pairs:
        g.add((row.child1, related, row.child2))
        g.add((row.child2, related, row.child1))
    return g


def get_labels_of_entities():
    """Return a mapping from MedGen concept URI to its label.

    Reads ``MGCONSO.RRF`` (pipe-separated, header on the first line) from the
    current working directory. A row contributes an entry only when
    ``SUPPRESS`` is not ``"Y"`` and ``ISPREF == "Y"``, ``STT == "PF"`` and
    ``TS == "P"``. If several such rows share a ``CUI``, the last one in file
    order wins.

    Returns:
        dict: ``"http://identifiers.org/medgen/<CUI>"`` -> value of the
        ``STR`` column for that row.
    """
    # File with the concepts: MGCONSO.RRF
    df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0)
    # Rename the column '#CUI' to 'CUI'
    df_concepts = df_concepts.rename(columns={"#CUI": "CUI"})
    # Remove the last column, it's empty
    df_concepts = df_concepts.iloc[:, :-1]

    # One vectorized filter replaces the per-row checks of an iterrows() loop,
    # which is very slow on large tables.
    preferred = df_concepts[
        (df_concepts["SUPPRESS"] != "Y")
        & (df_concepts["ISPREF"] == "Y")
        & (df_concepts["STT"] == "PF")
        & (df_concepts["TS"] == "P")
    ]
    # Later rows overwrite earlier ones, exactly like repeated dict assignment.
    return {
        f"http://identifiers.org/medgen/{cui}": label
        for cui, label in zip(preferred["CUI"], preferred["STR"])
    }


def generate_triples_file(graph: rdflib.Graph):
    """Write selected triples of ``graph`` to ``triples_medgen.tsv``.

    The file is created (or overwritten) in the current working directory.
    Each line is ``subject<TAB>predicate<TAB>object``. Triples are written
    grouped by predicate, in this order: ``related``, then the MedGen
    predicates ``RN``, ``RB``, ``PAR`` and ``CHD``. Triples with any other
    predicate are not exported.

    Args:
        graph: the graph whose triples are exported.
    """
    # Predicates to export, in output order.
    exported_predicates = (
        rdflib.URIRef("related"),
        rdflib.URIRef("http://identifiers.org/medgen/RN"),
        rdflib.URIRef("http://identifiers.org/medgen/RB"),
        rdflib.URIRef("http://identifiers.org/medgen/PAR"),
        rdflib.URIRef("http://identifiers.org/medgen/CHD"),
    )
    with open("triples_medgen.tsv", "w") as out:
        for predicate in exported_predicates:
            pattern = (None, predicate, None)
            for subj, pred, obj in graph.triples(pattern):
                out.write(f"{subj}\t{pred}\t{obj}\n")


def save_adjacency_matrix():
    """Write a 0/1 adjacency matrix of ``triples_medgen.tsv``.

    Reads the header-less, tab-separated triples file from the current
    working directory. Rows of the matrix are the unique values of the first
    column (subjects) and columns are the unique values of the third column
    (objects), both in order of first appearance. A cell is 1 when at least
    one triple links that subject to that object (whatever the relation) and
    0 otherwise. The result is saved tab-separated to ``adjacency_matrix.mat``.
    """
    # Local import: this module does not import numpy at file level.
    import numpy as np

    # Load the triples file generated
    triples = pd.read_csv("triples_medgen.tsv", sep="\t", header=None)
    # Get the unique subjects and objects
    subjects = triples[0].unique()
    objects = triples[2].unique()

    # Translate every triple into a (row, column) position once, then set all
    # cells in a single vectorized assignment instead of one .loc write per
    # triple inside an iterrows() loop.
    row_positions = pd.Index(subjects).get_indexer(triples[0])
    col_positions = pd.Index(objects).get_indexer(triples[2])
    cells = np.zeros((len(subjects), len(objects)), dtype="int64")
    cells[row_positions, col_positions] = 1

    adj_matrix = pd.DataFrame(cells, index=subjects, columns=objects)
    # Save the adjacency matrix
    adj_matrix.to_csv("adjacency_matrix.mat", sep="\t")


# %%
# Build the graph from MGCONSO.RRF and MGREL.RRF (read from the working directory).
g = get_graph()
# %%
# Add the derived "related" edges; the same graph object is returned.
g = apply_rules_to_graph(g)
# %%
# Concept URI -> label lookup, used further down when exporting the entity embeddings.
labels_of_entities = get_labels_of_entities()
# %%
# Export the selected triples to triples_medgen.tsv, which the training cell reads.
generate_triples_file(g)
# %%
# Train a knowledge-graph embedding model on the exported triples with PyKEEN.
# NOTE(review): TuckER and TransH are imported but not used in the visible code.
from pykeen.triples import TriplesFactory
from pykeen.models import TuckER, TransE, TransH
from pykeen.pipeline import pipeline

# Load the tab-separated triples written by generate_triples_file().
tf = TriplesFactory.from_path("triples_medgen.tsv")
print(f"Triples count: {tf.num_triples}")
# 80/10/10 split with a fixed seed.
# NOTE(review): confirm that split() returns the parts in the order
# (training, testing, validation) assumed by this unpacking.
training, testing, validation = tf.split([0.8, 0.1, 0.1], random_state=42, randomize_cleanup=False)
result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model=TransE,
    stopper="early",
    epochs=500,  # short epochs for testing - you should go
    # higher, especially with early stopper enabled
)
# NOTE(review): the directory name mentions "complex" although the model
# trained above is TransE; the path is kept unchanged.
result.save_to_directory("doctests/test_unstratified_stopped_complex")
# %%
import torch

alzheimers = "http://identifiers.org/medgen/C1843013"
# What does the model predict for Alzheimer's disease?
model = result.model
alzheimers_id = tf.entity_to_id[alzheimers]
relation_id = tf.relation_to_id["related"]

# A single (head, relation) pair; predict_t presumably returns one score per
# candidate tail entity for each pair in the batch.
batch_to_predict = torch.tensor([[alzheimers_id, relation_id]])

alzheimers_pred = model.predict_t(hr_batch=batch_to_predict)

print(alzheimers_pred.shape)
# Get the indices of the top 10 predictions (fewer if there are fewer than
# 10 scores, so topk cannot fail on a tiny graph)
top_k = min(10, alzheimers_pred.shape[-1])
top10 = torch.topk(alzheimers_pred, top_k, largest=True)
# Get the entities
entities = tf.entity_id_to_label
print(top10.indices)
for entity_idx in top10.indices[0]:
    # Ask the graph, what is the label for this entity?
    query = f"""
    PREFIX medgen: <http://identifiers.org/medgen/>
    SELECT ?label WHERE {{
        <{entities[entity_idx.item()]}> <http://www.w3.org/2000/01/rdf-schema#label> ?label
    }}
    """
    res = g.query(query)
    # Use a distinct name for the inner counter so it no longer shadows the
    # outer loop variable.
    for label_idx, row in enumerate(res):
        print(f"{label_idx}: {row}")
# %%
from pykeen.nn.representation import Embedding

# Representation modules holding the trained embeddings.
# NOTE(review): annotated as Embedding because the code below relies on their
# ``_embeddings`` attribute — confirm against the installed PyKEEN version.
entity_representation: Embedding = model.entity_representations[0]
relation_representation: Embedding = model.relation_representations[0]

# Get the embeddings of all the entities. The index tensor is created on the
# device that holds the embedding weights, so the cell also works on a
# CPU-only machine (a hard-coded .cuda() fails there).
entity_ids = torch.as_tensor(
    list(tf.entity_to_id.values()),
    dtype=torch.long,
    device=entity_representation._embeddings.weight.device,
)
entity_embeddings: torch.Tensor = entity_representation._embeddings(entity_ids)
# Get the embeddings of the relations
relation_ids = torch.as_tensor(
    list(tf.relation_to_id.values()),
    dtype=torch.long,
    device=relation_representation._embeddings.weight.device,
)
relation_embeddings: torch.Tensor = relation_representation._embeddings(relation_ids)

print(f"Entity embeddings shape: {entity_embeddings.shape}")
print(f"Relation embeddings shape: {relation_embeddings.shape}")

# URI of every entity, ordered by id 0..n-1.
entity_uris = [
    tf.entity_id_to_label[i] for i in range(len(tf.entity_id_to_label))
]

# Store the embeddings in a DataFrame
df = pd.DataFrame(
    {
        "embedding": entity_embeddings.detach().cpu().tolist(),
        # Entities without a known label get an empty string.
        "label": [labels_of_entities.get(uri, "") for uri in entity_uris],
        "uri": [f"{uri}" for uri in entity_uris],
    },
    index=range(len(entity_embeddings)),
)
## Save the DataFrame
df.to_csv("entity_embeddings.csv")

# Name of every relation, ordered by id 0..m-1.
relation_names = [
    tf.relation_id_to_label[i] for i in range(len(tf.relation_id_to_label))
]

# Store the embeddings in a DataFrame
df = pd.DataFrame(
    {
        "embedding": relation_embeddings.detach().cpu().tolist(),
        "label": list(relation_names),
        "uri": [f"{name}" for name in relation_names],
    },
    index=range(len(relation_embeddings)),
)
## Save the DataFrame
df.to_csv("relation_embeddings.csv")

# %%
import pyobo

# Look up a name through pyobo for prefix "mesh" and identifier "16793".
# NOTE(review): the return value is neither stored nor printed, so it is only
# visible as the cell output in an interactive session.
pyobo.get_name("mesh", "16793")

# %%