icebear0828 Claude Opus 4.6 committed on
Commit
7cc27d8
·
1 Parent(s): eea9e24

fix: tighten JS detection regex in extract-fingerprint to catch more minified patterns

Browse files

Change quantifier from * to + so bare identifier lines in markdown
are not falsely matched, while consecutive punctuation sequences
like backtick-comma-paren are still caught as minified JS.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. scripts/extract-fingerprint.ts +490 -0
scripts/extract-fingerprint.ts ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env tsx
2
+ /**
3
+ * extract-fingerprint.ts — Extracts key fingerprint values from a Codex Desktop
4
+ * installation (macOS .app or Windows extracted ASAR).
5
+ *
6
+ * Usage:
7
+ * npx tsx scripts/extract-fingerprint.ts --path "C:/path/to/Codex"
8
+ *
9
+ * The path can point to:
10
+ * - A macOS .app bundle (Codex.app)
11
+ * - A directory containing an already-extracted ASAR (with package.json and .vite/build/main.js)
12
+ * - A Windows install dir containing resources/app.asar
13
+ */
14
+
15
+ import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from "fs";
16
+ import { resolve, join } from "path";
17
+ import { createHash } from "crypto";
18
+ import { execSync } from "child_process";
19
+ import yaml from "js-yaml";
20
+
21
// Repository root: the parent of the directory containing this script.
const ROOT = resolve(import.meta.dirname, "..");
// JSON file that receives the extracted fingerprint summary.
const OUTPUT_PATH = resolve(ROOT, "data", "extracted-fingerprint.json");
// Directory that receives one markdown file per extracted prompt.
const PROMPTS_DIR = resolve(ROOT, "data", "extracted-prompts");
// YAML file holding the extraction patterns read by loadPatterns().
const PATTERNS_PATH = resolve(ROOT, "config", "extraction-patterns.yaml");
25
+
26
/**
 * Shape that the extraction-pattern YAML document is cast to after loading.
 * The cast is not validated at runtime.
 */
interface ExtractionPatterns {
  /**
   * Key names for package.json fields.
   * NOTE(review): nothing in this script reads these; extractFromPackageJson
   * uses hard-coded property names — confirm whether they are still needed.
   */
  package_json: {
    version_key: string;
    build_number_key: string;
    sparkle_feed_key: string;
  };
  /** Pattern entries keyed by name (e.g. api_base_url, originator, models, wham_endpoints). */
  main_js: Record<
    string,
    {
      /** Regular-expression source, compiled with `new RegExp`. */
      pattern?: string;
      /** Capture-group index to report; the whole match is used when omitted. */
      group?: number;
      // The next four fields are declared but not read by this script.
      global?: boolean;
      start_marker?: string;
      end_marker?: string;
      end_pattern?: string;
      /** Free-text description of the entry. */
      description: string;
    }
  >;
}
38
+
39
/** Document serialized to data/extracted-fingerprint.json by main(). */
interface ExtractedFingerprint {
  /** `version` from package.json, or "unknown". */
  app_version: string;
  /** `codexBuildNumber` from package.json as a string, or "unknown". */
  build_number: string;
  /** API base URL matched in main.js; null when main.js was not processed. */
  api_base_url: string | null;
  /** Originator value matched in main.js; null when main.js was not processed. */
  originator: string | null;
  /** Sorted, de-duplicated model matches. */
  models: string[];
  /** Sorted, de-duplicated WHAM endpoint matches. */
  wham_endpoints: string[];
  /** Fixed marker string; currently always "Codex Desktop/". */
  user_agent_contains: string;
  /** `codexSparkleFeedUrl` from package.json, or null. */
  sparkle_feed_url: string | null;
  /** For each prompt: short hash and the path of the saved markdown file (both null when absent). */
  prompts: {
    desktop_context_hash: string | null;
    desktop_context_path: string | null;
    title_generation_hash: string | null;
    title_generation_path: string | null;
    pr_generation_hash: string | null;
    pr_generation_path: string | null;
    automation_response_hash: string | null;
    automation_response_path: string | null;
  };
  /** ISO-8601 timestamp taken when the fingerprint object is built. */
  extracted_at: string;
  /** Resolved value of the --path argument. */
  source_path: string;
}
61
+
62
/**
 * Short content fingerprint: SHA-256 over the UTF-8 bytes of `content`,
 * truncated to the first 16 hex characters and prefixed with "sha256:".
 */
function sha256(content: string): string {
  const hasher = createHash("sha256");
  hasher.update(content, "utf-8");
  const shortDigest = hasher.digest("hex").slice(0, 16);
  return "sha256:" + shortDigest;
}
65
+
66
/**
 * Read the YAML file at PATTERNS_PATH and return the parsed document.
 * The result is cast to ExtractionPatterns without any runtime validation.
 */
function loadPatterns(): ExtractionPatterns {
  const yamlText = readFileSync(PATTERNS_PATH, "utf-8");
  return yaml.load(yamlText) as ExtractionPatterns;
}
70
+
71
/**
 * Resolve the directory holding the extracted application sources.
 *
 * Layouts are probed in this order, and the first hit wins:
 *   1. `inputPath` itself contains package.json (already extracted)
 *   2. macOS bundle: Contents/Resources/app.asar (extracted via extractAsar)
 *   3. Windows install: resources/app.asar (extracted via extractAsar)
 *   4. `extracted/` subdirectory containing package.json
 *   5. `recovered/extracted/` subdirectory containing package.json
 *
 * @param inputPath Directory given on the command line.
 * @returns Path to a directory with the unpacked sources.
 * @throws Error when none of the layouts is present.
 */
function findAsarRoot(inputPath: string): string {
  const hasPackageJson = (dir: string): boolean => existsSync(join(dir, "package.json"));

  if (hasPackageJson(inputPath)) {
    return inputPath;
  }

  // Packed archives: the macOS bundle layout first, then the Windows layout.
  for (const resourcesDir of ["Contents/Resources", "resources"]) {
    const asarFile = join(inputPath, resourcesDir, "app.asar");
    if (existsSync(asarFile)) {
      return extractAsar(asarFile);
    }
  }

  // Previously unpacked trees nested below the input path.
  for (const nestedDir of ["extracted", "recovered/extracted"]) {
    const candidate = join(inputPath, nestedDir);
    if (hasPackageJson(candidate)) {
      return candidate;
    }
  }

  throw new Error(
    `Cannot find Codex source at ${inputPath}. Expected package.json or app.asar.`
  );
}
109
+
110
/**
 * Unpack an ASAR archive into `<ROOT>/.asar-out` by running
 * `npx @electron/asar extract` through the shell, with the child's stdio
 * attached to this process.
 *
 * NOTE(review): the paths are only wrapped in double quotes, so a path that
 * itself contains a double quote (or shell-expanded characters) would break
 * the command line.
 *
 * @param asarPath Path to the app.asar file.
 * @returns The output directory the archive was extracted into.
 */
function extractAsar(asarPath: string): string {
  const outDir = resolve(ROOT, ".asar-out");
  console.log(`[extract] Extracting ASAR: ${asarPath} → ${outDir}`);

  const command = `npx @electron/asar extract "${asarPath}" "${outDir}"`;
  execSync(command, { stdio: "inherit" });

  return outDir;
}
118
+
119
/**
 * Step A: read version metadata from `<root>/package.json`.
 *
 * @param root Directory containing package.json.
 * @returns `version` (default "unknown"), `codexBuildNumber` coerced to a
 *          string (default "unknown"), and `codexSparkleFeedUrl` (default null).
 */
function extractFromPackageJson(root: string): {
  version: string;
  buildNumber: string;
  sparkleFeedUrl: string | null;
} {
  const manifest = JSON.parse(readFileSync(join(root, "package.json"), "utf-8"));

  const version = manifest.version ?? "unknown";
  // Build numbers may be stored as numbers; normalize to a string.
  const buildNumber = String(manifest.codexBuildNumber ?? "unknown");
  const sparkleFeedUrl = manifest.codexSparkleFeedUrl ?? null;

  return { version, buildNumber, sparkleFeedUrl };
}
136
+
137
/**
 * Return the first match of `spec.pattern` in `content`.
 * The configured capture group is reported; the whole match is used when no
 * group is configured or the group did not participate in the match.
 * Yields null when the entry has no pattern or nothing matches.
 */
function firstPatternMatch(
  content: string,
  spec: { pattern?: string; group?: number } | undefined,
): string | null {
  if (!spec?.pattern) return null;
  const hit = content.match(new RegExp(spec.pattern));
  if (!hit) return null;
  return hit[spec.group ?? 0] ?? hit[0];
}

/**
 * Return every distinct match of `spec.pattern` in `content`, sorted.
 * Group selection follows the same rule as firstPatternMatch.
 */
function allPatternMatches(
  content: string,
  spec: { pattern?: string; group?: number } | undefined,
): string[] {
  if (!spec?.pattern) return [];
  const groupIdx = spec.group ?? 0;
  const found = new Set<string>();
  for (const hit of content.matchAll(new RegExp(spec.pattern, "g"))) {
    found.add(hit[groupIdx] ?? hit[0]);
  }
  return [...found].sort();
}

/**
 * Step B: extract values from main.js using the configured patterns.
 *
 * Reads the `api_base_url`, `originator`, `models` and `wham_endpoints`
 * entries of `patterns`. Every entry honors its optional `group` index
 * (previously `api_base_url` always reported the whole match, unlike the
 * other three entries).
 *
 * @param content  Text of the main.js bundle.
 * @param patterns The `main_js` section of the extraction patterns.
 * @returns The extracted values; `models` and `whamEndpoints` are sorted and
 *          de-duplicated, `userAgentContains` is the fixed "Codex Desktop/".
 * @throws Error when `api_base_url` or `originator` cannot be extracted
 *         (missing pattern, no match, or empty match).
 */
function extractFromMainJs(
  content: string,
  patterns: ExtractionPatterns["main_js"],
): {
  apiBaseUrl: string | null;
  originator: string | null;
  models: string[];
  whamEndpoints: string[];
  userAgentContains: string;
} {
  // API base URL — critical field, fail fast before looking at anything else.
  const apiBaseUrl = firstPatternMatch(content, patterns.api_base_url);
  if (!apiBaseUrl) {
    console.error("[extract] CRITICAL: Failed to extract API base URL from main.js");
    console.error("[extract] The extraction pattern may need updating for this version.");
    throw new Error("Failed to extract critical field: api_base_url");
  }

  // Originator — critical field as well.
  const originator = firstPatternMatch(content, patterns.originator);
  if (!originator) {
    console.error("[extract] CRITICAL: Failed to extract originator from main.js");
    console.error("[extract] The extraction pattern may need updating for this version.");
    throw new Error("Failed to extract critical field: originator");
  }

  return {
    apiBaseUrl,
    originator,
    models: allPatternMatches(content, patterns.models),
    whamEndpoints: allPatternMatches(content, patterns.wham_endpoints),
    userAgentContains: "Codex Desktop/",
  };
}
210
+
211
/**
 * Collect the text of a template-literal prompt that begins at `marker`.
 *
 * A simple search for the closing "`;" can hit the wrong position in minified
 * code, so the text is walked line by line from the marker and cut at the
 * first line that looks like JavaScript rather than prompt text. A trailing
 * backtick on the last kept line is removed and the result is trimmed.
 *
 * NOTE(review): the first heuristic's character class includes whitespace, so
 * an indented prompt line that starts with a letter also ends the prompt —
 * confirm this is intended.
 *
 * @returns The prompt text, or null when the marker is absent or nothing is left.
 */
function sliceTemplateLiteralFrom(content: string, marker: string): string | null {
  const start = content.indexOf(marker);
  if (start === -1) return null;

  // Leading punctuation/whitespace run followed by an identifier character.
  const punctuationThenIdentifier = /^[`,;)\]}\s]+[A-Za-z_$]/;
  // Optional closing punctuation followed by a JavaScript keyword.
  const leadingKeyword = /^[`'";}\])\s]*(?:async\s+)?(?:function|class|const|let|var|return|throw|if|for|while)\b/;

  const kept: string[] = [];
  for (const line of content.slice(start).split("\n")) {
    if (punctuationThenIdentifier.test(line) || leadingKeyword.test(line)) break;
    kept.push(line);
  }

  // Drop the template literal's closing backtick if it ended up on the last line.
  const last = kept.pop();
  if (last !== undefined) kept.push(last.replace(/`\s*$/, ""));

  return kept.join("\n").trim() || null;
}

/**
 * Extract a prompt assembled as `[ "...", "..." ].join(...)`.
 *
 * The array is delimited by the nearest "[" before `marker` and the first
 * "].join(" after it; its string literals are decoded by parseStringArray.
 *
 * @returns The joined prompt, or null when the marker or either delimiter is missing.
 */
function extractJoinedArrayAround(content: string, marker: string): string | null {
  const markerIdx = content.indexOf(marker);
  if (markerIdx === -1) return null;

  const joinIdx = content.indexOf("].join(", markerIdx);
  if (joinIdx === -1) return null;

  const bracketStart = content.lastIndexOf("[", markerIdx);
  if (bracketStart === -1) return null;

  return parseStringArray(content.slice(bracketStart + 1, joinIdx));
}

/**
 * Step B (continued): extract the four system prompts from main.js.
 *
 * Desktop context and automation response are template literals located by a
 * marker string; title generation and PR generation are string arrays joined
 * at runtime. Each field is null when its prompt cannot be located.
 *
 * @param content Text of the main.js bundle.
 */
function extractPrompts(content: string): {
  desktopContext: string | null;
  titleGeneration: string | null;
  prGeneration: string | null;
  automationResponse: string | null;
} {
  const desktopContext = sliceTemplateLiteralFrom(content, "# Codex desktop context");

  const titleGeneration = extractJoinedArrayAround(
    content,
    "You are a helpful assistant. You will be presented with a user prompt",
  );

  const prGeneration = extractJoinedArrayAround(
    content,
    "You are a helpful assistant. Generate a pull request title",
  );

  const automationResponse = sliceTemplateLiteralFrom(
    content,
    "Response MUST end with a remark-directive block",
  );

  return { desktopContext, titleGeneration, prGeneration, automationResponse };
}
296
+
297
/**
 * Parse the inside of a JavaScript string-array literal into one string.
 *
 * Every single- or double-quoted string literal found in `arrayContent` is
 * unescaped and the results are joined with "\n". Template literals are not
 * recognised.
 *
 * Escapes are decoded in a single left-to-right pass, so an escaped backslash
 * followed by a letter (source text `\\n`) yields a backslash plus "n" rather
 * than being misread as a newline escape. Only `\n`, `\t`, `\"`, `\'` and
 * `\\` are decoded; any other escape sequence is kept verbatim.
 *
 * @param arrayContent Source text between the array's "[" and "]".
 * @returns The decoded elements joined by newlines ("" when none are found).
 */
function parseStringArray(arrayContent: string): string {
  // Quoted strings (double or single) with backslash escapes allowed inside.
  const stringRe = /"((?:[^"\\]|\\.)*)"|'((?:[^'\\]|\\.)*)'/g;
  const escapeTable = new Map<string, string>([
    ["n", "\n"],
    ["t", "\t"],
    ['"', '"'],
    ["'", "'"],
    ["\\", "\\"],
  ]);

  const lines: string[] = [];
  for (const m of arrayContent.matchAll(stringRe)) {
    const str = m[1] ?? m[2] ?? "";
    lines.push(
      // Consume backslash + next character as one unit so pairs never overlap.
      str.replace(/\\([\s\S])/g, (whole: string, ch: string) => escapeTable.get(ch) ?? whole),
    );
  }
  return lines.join("\n");
}
319
+
320
/**
 * Safety net: cut off trailing minified JS that slipped through extraction.
 *
 * The text is truncated at the first line that looks like an assignment
 * (`identifier =`, optionally preceded by closing punctuation) or that starts
 * with a JavaScript keyword. A closing backtick at the end of the last kept
 * line is removed and the result is trimmed.
 */
function sanitizePrompt(raw: string): string {
  const assignmentRe = /^[`,;)\]}\s]*[A-Za-z_$][A-Za-z0-9_$]*\s*=/;
  const keywordRe = /^[`'";}\])\s]*(?:async\s+)?(?:function|class|const|let|var|return|throw|if|for|while)\b/;

  const allLines = raw.split("\n");
  const cutAt = allLines.findIndex((line) => assignmentRe.test(line) || keywordRe.test(line));
  const kept = cutAt === -1 ? allLines : allLines.slice(0, cutAt);

  // Strip the template literal's closing backtick from the final line, if any.
  const last = kept.pop();
  if (last !== undefined) {
    kept.push(last.replace(/`\s*$/, ""));
  }

  return kept.join("\n").trim();
}
334
+
335
/**
 * Sanitize a prompt, write it to `<PROMPTS_DIR>/<name>.md`, and return its
 * short hash together with the file path.
 *
 * The hash is computed over the sanitized text — the exact bytes written to
 * disk — so the returned hash and file always describe the same content.
 * (Previously the raw text was hashed, which made the two disagree whenever
 * sanitizing removed or trimmed anything.)
 *
 * @param name    Base file name, without extension.
 * @param content Raw extracted prompt, or null when extraction found nothing.
 * @returns `{ hash, path }`; both are null when there is nothing to save.
 */
function savePrompt(name: string, content: string | null): { hash: string | null; path: string | null } {
  if (!content) return { hash: null, path: null };

  const sanitized = sanitizePrompt(content);
  if (!sanitized) return { hash: null, path: null };

  mkdirSync(PROMPTS_DIR, { recursive: true });
  const filePath = join(PROMPTS_DIR, `${name}.md`);
  writeFileSync(filePath, sanitized);

  return {
    hash: sha256(sanitized),
    path: filePath,
  };
}
350
+
351
/**
 * CLI entry point.
 *
 * Reads the `--path` argument, locates the application sources, extracts
 * metadata from package.json, loads (and, if it looks minified, beautifies)
 * the main.js bundle, extracts fingerprint values and prompts from it, saves
 * the prompts, and writes the fingerprint JSON to OUTPUT_PATH.
 *
 * Exits with code 1 after printing usage when `--path` is missing.
 */
async function main() {
  // Locate the value that follows --path on the command line.
  const flagPos = process.argv.indexOf("--path");
  const rawPath = flagPos === -1 ? undefined : process.argv[flagPos + 1];
  if (!rawPath) {
    const usageLines = [
      "Usage: npx tsx scripts/extract-fingerprint.ts --path <codex-path>",
      "",
      " <codex-path> can be:",
      " - macOS: /path/to/Codex.app",
      " - Windows: C:/path/to/Codex (containing resources/app.asar)",
      " - Extracted: directory with package.json and .vite/build/main.js",
    ];
    for (const usageLine of usageLines) {
      console.error(usageLine);
    }
    process.exit(1);
  }

  const inputPath = resolve(rawPath);
  console.log(`[extract] Input: ${inputPath}`);

  // Find ASAR root
  const asarRoot = findAsarRoot(inputPath);
  console.log(`[extract] ASAR root: ${asarRoot}`);

  // Load extraction patterns
  const patterns = loadPatterns();

  // Step A: package.json
  console.log("[extract] Reading package.json...");
  const pkgInfo = extractFromPackageJson(asarRoot);
  console.log(` version: ${pkgInfo.version}`);
  console.log(` build: ${pkgInfo.buildNumber}`);

  // Step B: main.js (or main-XXXXX.js chunk)
  console.log("[extract] Loading main.js...");
  const readMainJs = async () => {
    const buildDir = join(asarRoot, ".vite/build");
    // Prefer a main-*.js chunk (Vite code-split); otherwise fall back to main.js.
    let mainPath = join(buildDir, "main.js");
    if (existsSync(buildDir)) {
      const chunk = readdirSync(buildDir).find((f) => /^main-[A-Za-z0-9]+\.js$/.test(f));
      if (chunk) {
        mainPath = join(buildDir, chunk);
        console.log(`[extract] Found chunk: ${chunk}`);
      }
    }
    if (!existsSync(mainPath)) {
      console.warn("[extract] main.js not found, skipping JS extraction");
      return null;
    }

    const content = readFileSync(mainPath, "utf-8");
    // Few lines but a lot of text: treat the bundle as minified.
    const looksMinified = content.split("\n").length < 100 && content.length > 100000;
    if (!looksMinified) {
      return content;
    }

    console.log("[extract] main.js appears minified, attempting beautify...");
    try {
      const jsBeautify = await import("js-beautify");
      return jsBeautify.default.js(content, { indent_size: 2 });
    } catch {
      // Beautifying is best effort; the raw bundle is still usable.
      console.warn("[extract] js-beautify not available, using raw content");
      return content;
    }
  };
  const mainJs = await readMainJs();

  // Defaults used when main.js is unavailable.
  let mainJsResults: ReturnType<typeof extractFromMainJs> = {
    apiBaseUrl: null,
    originator: null,
    models: [],
    whamEndpoints: [],
    userAgentContains: "Codex Desktop/",
  };
  let promptResults: ReturnType<typeof extractPrompts> = {
    desktopContext: null,
    titleGeneration: null,
    prGeneration: null,
    automationResponse: null,
  };

  if (mainJs) {
    console.log(`[extract] main.js loaded (${mainJs.split("\n").length} lines)`);

    mainJsResults = extractFromMainJs(mainJs, patterns.main_js);
    console.log(` API base URL: ${mainJsResults.apiBaseUrl}`);
    console.log(` originator: ${mainJsResults.originator}`);
    console.log(` models: ${mainJsResults.models.join(", ")}`);
    console.log(` WHAM endpoints: ${mainJsResults.whamEndpoints.length} found`);

    // Extract system prompts
    console.log("[extract] Extracting system prompts...");
    promptResults = extractPrompts(mainJs);
    const status = (value: string | null): string => (value ? "found" : "NOT FOUND");
    console.log(` desktop-context: ${status(promptResults.desktopContext)}`);
    console.log(` title-generation: ${status(promptResults.titleGeneration)}`);
    console.log(` pr-generation: ${status(promptResults.prGeneration)}`);
    console.log(` automation-response: ${status(promptResults.automationResponse)}`);
  }

  // Save extracted prompts
  const dc = savePrompt("desktop-context", promptResults.desktopContext);
  const tg = savePrompt("title-generation", promptResults.titleGeneration);
  const pr = savePrompt("pr-generation", promptResults.prGeneration);
  const ar = savePrompt("automation-response", promptResults.automationResponse);

  // Build output
  const fingerprint: ExtractedFingerprint = {
    app_version: pkgInfo.version,
    build_number: pkgInfo.buildNumber,
    api_base_url: mainJsResults.apiBaseUrl,
    originator: mainJsResults.originator,
    models: mainJsResults.models,
    wham_endpoints: mainJsResults.whamEndpoints,
    user_agent_contains: mainJsResults.userAgentContains,
    sparkle_feed_url: pkgInfo.sparkleFeedUrl,
    prompts: {
      desktop_context_hash: dc.hash,
      desktop_context_path: dc.path,
      title_generation_hash: tg.hash,
      title_generation_path: tg.path,
      pr_generation_hash: pr.hash,
      pr_generation_path: pr.path,
      automation_response_hash: ar.hash,
      automation_response_path: ar.path,
    },
    extracted_at: new Date().toISOString(),
    source_path: inputPath,
  };

  // Write output
  mkdirSync(resolve(ROOT, "data"), { recursive: true });
  writeFileSync(OUTPUT_PATH, JSON.stringify(fingerprint, null, 2));

  console.log(`\n[extract] Fingerprint written to ${OUTPUT_PATH}`);
  console.log(`[extract] Prompts written to ${PROMPTS_DIR}/`);
  console.log("[extract] Done.");
}
486
+
487
// Run the CLI; any rejection is logged and turned into exit code 1.
main().catch((fatalError) => {
  console.error("[extract] Fatal:", fatalError);
  process.exit(1);
});