dv4aby committed on
Commit
87d2e85
·
verified ·
1 Parent(s): 96772bb

Upload source code structural_encoder_v2.py

Browse files
Files changed (1) hide show
  1. structural_encoder_v2.py +196 -0
structural_encoder_v2.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ from collections import defaultdict
3
+ from typing import Dict, List, Tuple, TYPE_CHECKING
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torch_geometric.data import HeteroData, Batch
8
+ from torch_geometric.nn import HeteroConv, GATConv, global_mean_pool
9
+ from transformers import AutoModel, AutoTokenizer
10
+ from tqdm import tqdm
11
+ import numpy as np
12
+
13
+ if TYPE_CHECKING:
14
+ import pandas as pd
15
+
16
+ # Import Builder from dataloader for inference/eval
17
+ from dataloader import CodeGraphBuilder
18
+
19
class RelationalGraphEncoder(nn.Module):
    """Relational GNN encoder over the heterogeneous AST + CFG code graph.

    Pipeline: the ``x`` ids of the ``ast``, ``token`` and ``stmt`` node types are
    looked up in per-type embedding tables, passed through ``num_layers``
    ``HeteroConv`` layers (one ``GATConv`` per relation in ``EDGE_TYPES``, results
    summed per destination type, followed by ReLU), mean-pooled per graph for each
    node type, averaged across node types and finally projected to ``out_dim``.
    """

    # (source node type, relation name, destination node type)
    EDGE_TYPES = (
        ("ast", "ast_parent_child", "ast"),
        ("ast", "ast_child_parent", "ast"),
        ("ast", "ast_next_sibling", "ast"),
        ("ast", "ast_prev_sibling", "ast"),
        ("token", "token_to_ast", "ast"),
        ("ast", "ast_to_token", "token"),
        ("stmt", "cfg", "stmt"),
        ("stmt", "cfg_rev", "stmt"),
        ("stmt", "stmt_to_ast", "ast"),
        ("ast", "ast_to_stmt", "stmt"),
    )

    def __init__(self, hidden_dim: int = 256, out_dim: int = 768, num_layers: int = 2) -> None:
        """Build the embedding tables, the message-passing stack and the output head.

        Args:
            hidden_dim: Width of the node embeddings and of every conv layer.
            out_dim: Width of the returned graph representation.
            num_layers: Number of stacked ``HeteroConv`` layers.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim

        # One id -> vector table per node type; the table sizes bound the valid ids.
        self.ast_encoder = nn.Embedding(2048, hidden_dim)
        self.token_encoder = nn.Embedding(8192, hidden_dim)
        self.stmt_encoder = nn.Embedding(512, hidden_dim)

        # (-1, -1) lets GATConv infer the source/destination input widths lazily.
        self.convs = nn.ModuleList(
            [
                HeteroConv(
                    {
                        relation: GATConv((-1, -1), hidden_dim, add_self_loops=False)
                        for relation in self.EDGE_TYPES
                    },
                    aggr="sum",
                )
                for _ in range(num_layers)
            ]
        )

        self.output_proj = nn.Linear(hidden_dim, out_dim)

    def _encode_nodes(self, data: "HeteroData") -> Dict[str, torch.Tensor]:
        """Embed the ``x`` ids of every node type.

        A node type that is absent from ``data`` (or has no ``x``) maps to an empty
        ``(0, hidden_dim)`` tensor, so the returned dict always has all three keys.
        """
        device = self.ast_encoder.weight.device
        tables = (
            ("ast", self.ast_encoder),
            ("token", self.token_encoder),
            ("stmt", self.stmt_encoder),
        )

        features: Dict[str, torch.Tensor] = {}
        for node_type, table in tables:
            ids = data[node_type].get('x') if node_type in data.node_types else None
            if ids is None:
                features[node_type] = torch.zeros((0, self.hidden_dim), device=device)
            else:
                features[node_type] = table(ids.to(device))
        return features

    def forward(self, data: "HeteroData") -> torch.Tensor:
        """Return one ``out_dim`` vector per graph in ``data``.

        Args:
            data: A single heterogeneous graph or a batch of them; it is moved to
                the device of this module's parameters.

        Returns:
            Tensor of shape ``(batch_size, out_dim)``, where ``batch_size`` is
            ``data.num_graphs`` when that attribute exists and ``1`` otherwise.
        """
        device = next(self.parameters()).device
        data = data.to(device)

        x_dict = self._encode_nodes(data)

        # Only hand the convs the relations that are actually present in this input.
        present = data.edge_index_dict
        edge_index_dict = {rel: present[rel] for rel in self.EDGE_TYPES if rel in present}

        for layer in self.convs:
            x_dict = {
                node_type: F.relu(hidden)
                for node_type, hidden in layer(x_dict, edge_index_dict).items()
            }

        # Global pooling: one mean vector per graph and per node type.
        batch_size = getattr(data, 'num_graphs', 1)

        pooled_embeddings = []
        for node_type, hidden in x_dict.items():
            if hidden.size(0) == 0:
                continue

            batch_vector = getattr(data[node_type], 'batch', None)
            if batch_vector is not None:
                pooled_embeddings.append(global_mean_pool(hidden, batch_vector, size=batch_size))
            else:
                # Single graph without a batch vector (e.g. inference on one item):
                # this yields a single row, i.e. it presumes batch_size == 1.
                pooled_embeddings.append(hidden.mean(dim=0, keepdim=True))

        if not pooled_embeddings:
            return torch.zeros((batch_size, self.out_dim), device=device)

        # Average across node types: [num_types, B, dim] -> [B, dim].
        # NOTE(review): a graph lacking some node type presumably contributes a zero
        # row for that type (the original author notes global_mean_pool returns 0
        # for empty batches), which is averaged in here - confirm this is intended.
        graph_repr = torch.stack(pooled_embeddings).mean(dim=0)
        return self.output_proj(graph_repr)
119
+
120
+
121
class GatedFusion(nn.Module):
    """Blend a text embedding with a graph embedding through a learned sigmoid gate."""

    def __init__(self, text_dim: int, graph_dim: int) -> None:
        """Create the graph projection and the gate layer.

        Args:
            text_dim: Width of the text embedding; also the width of the output.
            graph_dim: Width of the incoming graph embedding.
        """
        super().__init__()
        # Bring the graph embedding to the text width so both can be mixed.
        self.graph_proj = nn.Linear(graph_dim, text_dim)
        # The gate sees [text ; projected graph] and emits one weight per feature.
        self.gate = nn.Linear(text_dim * 2, text_dim)

    def forward(self, h_text: torch.Tensor, h_graph: torch.Tensor) -> torch.Tensor:
        """Return ``g * h_text + (1 - g) * proj(h_graph)`` with ``g`` in (0, 1)."""
        projected = self.graph_proj(h_graph)
        gate_logits = self.gate(torch.cat((h_text, projected), dim=-1))
        text_weight = torch.sigmoid(gate_logits)
        graph_weight = 1.0 - text_weight
        return text_weight * h_text + graph_weight * projected
132
+
133
+
134
+ class StructuralEncoderV2(nn.Module):
135
+ """Structural encoder that fuses GraphCodeBERT text features with AST+CFG graph context."""
136
+
137
+ def __init__(self, device: torch.device | str, graph_hidden_dim: int = 256, graph_layers: int = 2):
138
+ super().__init__()
139
+ self.device = torch.device(device)
140
+ self.text_tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
141
+ self.text_model = AutoModel.from_pretrained("microsoft/graphcodebert-base")
142
+ self.text_model.to(self.device)
143
+
144
+ self.graph_encoder = RelationalGraphEncoder(hidden_dim=graph_hidden_dim, out_dim=self.text_model.config.hidden_size, num_layers=graph_layers)
145
+ self.graph_encoder.to(self.device)
146
+
147
+ self.fusion = GatedFusion(self.text_model.config.hidden_size, self.text_model.config.hidden_size)
148
+ self.fusion.to(self.device)
149
+
150
+ def encode_text(self, codes: List[str]) -> torch.Tensor:
151
+ inputs = self.text_tokenizer(
152
+ codes,
153
+ padding=True,
154
+ truncation=True,
155
+ max_length=512,
156
+ return_tensors="pt",
157
+ ).to(self.device)
158
+ outputs = self.text_model(**inputs)
159
+ return outputs.last_hidden_state[:, 0, :]
160
+
161
+ def forward(self, codes: List[str], graph_batch: Batch | HeteroData) -> torch.Tensor:
162
+ text_embeddings = self.encode_text(codes)
163
+ graph_embeddings = self.graph_encoder(graph_batch)
164
+ return self.fusion(text_embeddings, graph_embeddings)
165
+
166
+ def generate_embeddings(self, df: "pd.DataFrame", batch_size: int = 8, save_path: str | None = None, desc: str = "Structural V2 embeddings") -> np.ndarray:
167
+ # Create local builder for inference
168
+ builder = CodeGraphBuilder()
169
+
170
+ codes = df["code"].tolist()
171
+ batches = range(0, len(codes), batch_size)
172
+ all_embeddings: List[torch.Tensor] = []
173
+
174
+ for start in tqdm(batches, desc=desc):
175
+ batch_codes = codes[start:start + batch_size]
176
+
177
+ # Parallelism here not strictly needed for eval unless slow, but we do it simply
178
+ data_list = [builder.build(c) for c in batch_codes]
179
+ graph_batch = Batch.from_data_list(data_list)
180
+
181
+ with torch.no_grad():
182
+ fused = self.forward(batch_codes, graph_batch)
183
+ all_embeddings.append(fused.cpu())
184
+
185
+ embeddings = torch.cat(all_embeddings, dim=0).numpy().astype("float32")
186
+ if save_path is not None:
187
+ np.save(save_path, embeddings)
188
+ return embeddings
189
+
190
+ def load_checkpoint(self, checkpoint_path: str, map_location: str | torch.device = "cpu", strict: bool = True) -> None:
191
+ if not checkpoint_path:
192
+ raise ValueError("checkpoint_path must be provided")
193
+ state = torch.load(checkpoint_path, map_location=map_location)
194
+ if isinstance(state, dict) and "state_dict" in state:
195
+ state = state["state_dict"]
196
+ self.load_state_dict(state, strict=strict)