update
Browse files- hf-logo.svg +8 -0
- modular_graph_and_candidates.py +126 -31
hf-logo.svg
ADDED
|
|
modular_graph_and_candidates.py
CHANGED
|
@@ -94,10 +94,10 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
|
|
| 94 |
out[(m1, m2)] = s
|
| 95 |
return out
|
| 96 |
|
| 97 |
-
|
| 98 |
-
def
|
| 99 |
-
model = SentenceTransformer("codesage/codesage-large-v2", trust_remote_code=True)
|
| 100 |
-
model.max_seq_length =
|
| 101 |
texts = {}
|
| 102 |
|
| 103 |
for name in tqdm(missing, desc="Reading modeling files"):
|
|
@@ -113,7 +113,7 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
| 113 |
all_embeddings = []
|
| 114 |
|
| 115 |
print("Encoding embeddings...")
|
| 116 |
-
batch_size =
|
| 117 |
for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
|
| 118 |
batch = [texts[n] for n in names[i:i+batch_size]]
|
| 119 |
emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
|
|
@@ -122,7 +122,7 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
| 122 |
embeddings = np.vstack(all_embeddings) # [N, D]
|
| 123 |
|
| 124 |
print("Computing pairwise similarities...")
|
| 125 |
-
sims = embeddings @ embeddings.T
|
| 126 |
|
| 127 |
out = {}
|
| 128 |
for i in range(len(names)):
|
|
@@ -132,6 +132,59 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
| 132 |
out[(names[i], names[j])] = float(s)
|
| 133 |
return out
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
|
|
@@ -261,34 +314,72 @@ def generate_html(graph: dict) -> str:
|
|
| 261 |
# ────────────────────────────────────────────────────────────────────────────────
|
| 262 |
CSS = """
|
| 263 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
"""
|
| 277 |
|
| 278 |
JS = """
|
| 279 |
-
|
| 280 |
function updateVisibility() {
|
| 281 |
const show = document.getElementById('toggleRed').checked;
|
| 282 |
svg.selectAll('.link.cand').style('display', show ? null : 'none');
|
| 283 |
svg.selectAll('.node.cand').style('display', show ? null : 'none');
|
| 284 |
-
svg.selectAll('.link-label')
|
| 285 |
-
.filter(d => d.cand)
|
| 286 |
-
.style('display', show ? null : 'none');
|
| 287 |
}
|
| 288 |
-
|
| 289 |
document.getElementById('toggleRed').addEventListener('change', updateVisibility);
|
| 290 |
|
| 291 |
-
|
| 292 |
const graph = __GRAPH_DATA__;
|
| 293 |
const W = innerWidth, H = innerHeight;
|
| 294 |
const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
|
|
@@ -311,17 +402,21 @@ const node = g.selectAll('g.node')
|
|
| 311 |
.attr('class', d => `node ${d.cls}`)
|
| 312 |
.call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
|
| 313 |
|
| 314 |
-
node.filter(d => d.cls==='base')
|
| 315 |
-
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
node.append('text').attr('class','node-label').attr('dy','-2.4em').text(d => d.id);
|
| 318 |
|
| 319 |
const sim = d3.forceSimulation(graph.nodes)
|
| 320 |
-
.force('link', d3.forceLink(graph.links).id(d => d.id).distance(520))
|
| 321 |
-
.force('charge', d3.forceManyBody().strength(-600))
|
| 322 |
.force('center', d3.forceCenter(W / 2, H / 2))
|
| 323 |
-
.force('collide', d3.forceCollide(d =>
|
| 324 |
-
|
| 325 |
|
| 326 |
sim.on('tick', () => {
|
| 327 |
link.attr('x1', d=>d.source.x).attr('y1', d=>d.source.y)
|
|
|
|
| 94 |
out[(m1, m2)] = s
|
| 95 |
return out
|
| 96 |
|
| 97 |
+
#@spaces.GPU
|
| 98 |
+
def old_embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
|
| 99 |
+
model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
|
| 100 |
+
model.max_seq_length = 8192 # truncate overly long modeling files
|
| 101 |
texts = {}
|
| 102 |
|
| 103 |
for name in tqdm(missing, desc="Reading modeling files"):
|
|
|
|
| 113 |
all_embeddings = []
|
| 114 |
|
| 115 |
print("Encoding embeddings...")
|
| 116 |
+
batch_size = 2
|
| 117 |
for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
|
| 118 |
batch = [texts[n] for n in names[i:i+batch_size]]
|
| 119 |
emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
|
|
|
|
| 122 |
embeddings = np.vstack(all_embeddings) # [N, D]
|
| 123 |
|
| 124 |
print("Computing pairwise similarities...")
|
| 125 |
+
sims = embeddings @ embeddings.T
|
| 126 |
|
| 127 |
out = {}
|
| 128 |
for i in range(len(names)):
|
|
|
|
| 132 |
out[(names[i], names[j])] = float(s)
|
| 133 |
return out
|
| 134 |
|
| 135 |
+
#@spaces.GPU
|
| 136 |
+
def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
    """Return pairs of models whose modeling code is at least ``thr`` cosine-similar.

    For each name in ``missing``, every ``modeling_*.py`` found recursively under
    ``models_root / name`` is read, passed through ``_strip_source`` and
    concatenated. The resulting texts are embedded on CPU with the
    ``codesage/codesage-large-v2`` SentenceTransformer, L2-normalised, and
    compared pairwise.

    Args:
        models_root: Directory containing one sub-directory per model name.
        missing: Model names to embed and compare.
        thr: Minimum cosine similarity (inclusive) for a pair to be reported.

    Returns:
        A dict mapping ``(name_a, name_b)`` to their similarity, where
        ``name_a`` precedes ``name_b`` in ``missing`` order. Empty when fewer
        than two models are given or no pair reaches ``thr``.
    """
    if not missing:
        # Nothing to compare. Returning here also avoids loading the model and
        # the ValueError np.vstack raises on an empty list of arrays.
        return {}

    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)

    # Hard-cap by backend max positions (prevents IndexError in self.wpe).
    pos_limit = 1024  # conservative fallback if the config is odd
    try:
        cfg = model[0].auto_model.config
        # Probe the attributes one at a time: nesting getattr() calls would
        # evaluate the inner lookup eagerly and fail on configs that define
        # only ``n_positions``.
        for attr in ("n_positions", "max_position_embeddings"):
            value = getattr(cfg, attr, None)
            if value is not None:
                pos_limit = int(value)
                break
    except Exception:
        # Deliberately broad: this is best-effort introspection of a
        # remote-code model; any failure keeps the conservative fallback.
        pos_limit = 1024

    seq_len = min(pos_limit, 2048)  # optional extra ceiling if pos_limit is huge
    model.max_seq_length = seq_len  # SentenceTransformer wrapper
    model[0].max_seq_length = seq_len  # its Transformer submodule actually used for tokenize()
    model[0].tokenizer.model_max_length = seq_len  # ensure tokenizer truncates

    texts = {}
    for name in tqdm(missing, desc="Reading modeling files"):
        parts = []
        # Sort so the concatenation (and therefore the truncated text that is
        # embedded) does not depend on filesystem enumeration order.
        for py in sorted((models_root / name).rglob("modeling_*.py")):
            try:
                parts.append(_strip_source(py.read_text(encoding="utf-8")))
            except Exception:
                # Best-effort: an unreadable or unparsable file is skipped.
                continue
        # A single space keeps the encoder from receiving an empty string.
        texts[name] = "\n".join(parts).strip() or " "

    names = list(texts)
    all_embeddings = []

    print("Encoding embeddings...")
    batch_size = 2
    for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
        batch = [texts[n] for n in names[i:i + batch_size]]
        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embeddings.append(emb)

    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
    import numpy as np
    embeddings = np.vstack(all_embeddings).astype(np.float32)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12  # epsilon guards all-zero rows
    embeddings = embeddings / norms

    print("Computing pairwise similarities...")
    sims_mat = embeddings @ embeddings.T

    # Strict upper triangle in row-major order == every (i, j) with i < j,
    # visited in the same order as a nested ``for i / for j > i`` loop.
    rows, cols = np.triu_indices(len(names), k=1)
    # Compare in float64 so the threshold test matches ``float(s) >= thr``.
    scores = sims_mat[rows, cols].astype(np.float64)
    keep = scores >= thr
    return {
        (names[i], names[j]): s
        for i, j, s in zip(rows[keep].tolist(), cols[keep].tolist(), scores[keep].tolist())
    }
|
| 187 |
+
|
| 188 |
|
| 189 |
|
| 190 |
|
|
|
|
| 314 |
# ────────────────────────────────────────────────────────────────────────────────
|
| 315 |
CSS = """
|
| 316 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
|
| 317 |
+
|
| 318 |
+
:root{
|
| 319 |
+
--bg:#ffffff;
|
| 320 |
+
--text:#222222;
|
| 321 |
+
--muted:#555555;
|
| 322 |
+
--outline:#ffffff;
|
| 323 |
+
}
|
| 324 |
+
@media (prefers-color-scheme: dark){
|
| 325 |
+
:root{
|
| 326 |
+
--bg:#0b0d10;
|
| 327 |
+
--text:#e8e8e8;
|
| 328 |
+
--muted:#c8c8c8;
|
| 329 |
+
--outline:#000000;
|
| 330 |
+
}
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
body{ margin:0; font-family:'Inter',Arial,sans-serif; background:var(--bg); overflow:hidden; }
|
| 334 |
+
svg{ width:100vw; height:100vh; }
|
| 335 |
+
|
| 336 |
+
.link{ stroke:#999; stroke-opacity:.6; }
|
| 337 |
+
.link.cand{ stroke:#e63946; stroke-width:2.5; }
|
| 338 |
+
|
| 339 |
+
.node-label{
|
| 340 |
+
fill:var(--text);
|
| 341 |
+
pointer-events:none;
|
| 342 |
+
text-anchor:middle;
|
| 343 |
+
font-weight:600;
|
| 344 |
+
paint-order:stroke fill;
|
| 345 |
+
stroke:var(--outline);
|
| 346 |
+
stroke-width:3px;
|
| 347 |
+
}
|
| 348 |
+
.link-label{
|
| 349 |
+
fill:var(--muted);
|
| 350 |
+
pointer-events:none;
|
| 351 |
+
text-anchor:middle;
|
| 352 |
+
font-size:10px;
|
| 353 |
+
paint-order:stroke fill;
|
| 354 |
+
stroke:var(--bg);
|
| 355 |
+
stroke-width:2px;
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
.node.base image{ width:60px; height:60px; transform:translate(-30px,-30px); }
|
| 359 |
+
.node.derived circle{ fill:#1f77b4; }
|
| 360 |
+
.node.cand circle, .node.cand path{ fill:#e63946; }
|
| 361 |
+
|
| 362 |
+
#legend{
|
| 363 |
+
position:fixed; top:18px; left:18px;
|
| 364 |
+
background:rgba(255,255,255,.92);
|
| 365 |
+
padding:18px 28px; border-radius:10px; border:1.5px solid #bbb;
|
| 366 |
+
font-size:18px; box-shadow:0 2px 8px rgba(0,0,0,.08);
|
| 367 |
+
}
|
| 368 |
+
@media (prefers-color-scheme: dark){
|
| 369 |
+
#legend{ background:rgba(20,22,25,.92); color:#e8e8e8; border-color:#444; }
|
| 370 |
+
}
|
| 371 |
"""
|
| 372 |
|
| 373 |
JS = """
|
|
|
|
| 374 |
function updateVisibility() {
|
| 375 |
const show = document.getElementById('toggleRed').checked;
|
| 376 |
svg.selectAll('.link.cand').style('display', show ? null : 'none');
|
| 377 |
svg.selectAll('.node.cand').style('display', show ? null : 'none');
|
| 378 |
+
svg.selectAll('.link-label').filter(d => d.cand).style('display', show ? null : 'none');
|
|
|
|
|
|
|
| 379 |
}
|
|
|
|
| 380 |
document.getElementById('toggleRed').addEventListener('change', updateVisibility);
|
| 381 |
|
| 382 |
+
const HF_LOGO_URI = "__HF_LOGO_DATA_URI__";
|
| 383 |
const graph = __GRAPH_DATA__;
|
| 384 |
const W = innerWidth, H = innerHeight;
|
| 385 |
const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
|
|
|
|
| 402 |
.attr('class', d => `node ${d.cls}`)
|
| 403 |
.call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
|
| 404 |
|
| 405 |
+
const baseSel = node.filter(d => d.cls === 'base');
|
| 406 |
+
if (HF_LOGO_URI){
|
| 407 |
+
baseSel.append('image').attr('href', HF_LOGO_URI);
|
| 408 |
+
}else{
|
| 409 |
+
baseSel.append('circle').attr('r', d => 22*d.sz).attr('fill', '#ffbe0b');
|
| 410 |
+
}
|
| 411 |
+
node.filter(d => d.cls !== 'base').append('circle').attr('r', d => 20*d.sz);
|
| 412 |
+
|
| 413 |
node.append('text').attr('class','node-label').attr('dy','-2.4em').text(d => d.id);
|
| 414 |
|
| 415 |
const sim = d3.forceSimulation(graph.nodes)
|
| 416 |
+
.force('link', d3.forceLink(graph.links).id(d => d.id).distance(520))
|
| 417 |
+
.force('charge', d3.forceManyBody().strength(-600))
|
| 418 |
.force('center', d3.forceCenter(W / 2, H / 2))
|
| 419 |
+
.force('collide', d3.forceCollide(d => 50));
|
|
|
|
| 420 |
|
| 421 |
sim.on('tick', () => {
|
| 422 |
link.attr('x1', d=>d.source.x).attr('y1', d=>d.source.y)
|