#!/usr/bin/env node

// merge_csv.js: merge multiple CSV files into a single CSV file.

import fs from 'fs/promises'
import path from 'path'

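// Example invocations (file and directory names below are illustrative):
//   node scripts/merge_csv.js --inputDir ./data --out merged.csv
//   node scripts/merge_csv.js jan.csv feb.csv --out out/merged.csv
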
// Print CLI usage information.
function usage() {
  console.log(`Usage:\n  node scripts/merge_csv.js --inputDir <dir> --out <out.csv>\n  node scripts/merge_csv.js <file1.csv> <file2.csv> --out <out.csv>\n\nOptions:\n  --inputDir  Read every .csv file in the given directory (non-recursive)\n  --out       Output file path (default: merged.csv)\n  --help      Show this help message\n`)
}

// Parse process.argv into { files, inputDir, out, help }.
function parseArgs(argv) {
  const args = argv.slice(2)
  const opts = { files: [], inputDir: null, out: 'merged.csv' }
  for (let i = 0; i < args.length; i++) {
    const a = args[i]
    if (a === '--help' || a === '-h') { opts.help = true; break }
    if (a === '--inputDir') { opts.inputDir = args[++i]; continue }
    if (a === '--out') { opts.out = args[++i]; continue }
    if (a.startsWith('--')) {
      console.warn('Unknown option', a)
      continue
    }
    opts.files.push(a)
  }
  return opts
}

// Split a single CSV line into fields, honoring double-quoted values and
// escaped quotes (""). Newlines inside quoted fields are not supported,
// since the input is split into lines before parsing.
function parseCSVLine(line) {
  const res = []
  let cur = ''
  let inQuotes = false
  for (let i = 0; i < line.length; i++) {
    const ch = line[i]
    if (inQuotes) {
      if (ch === '"') {
        if (i + 1 < line.length && line[i + 1] === '"') {
          cur += '"'
          i++
        } else {
          inQuotes = false
        }
      } else {
        cur += ch
      }
    } else {
      if (ch === ',') {
        res.push(cur)
        cur = ''
      } else if (ch === '"') {
        inQuotes = true
      } else {
        cur += ch
      }
    }
  }
  res.push(cur)
  return res
}

// Quote a value for CSV output when it contains quotes, commas, or newlines.
function csvEscape(value) {
  if (value == null) return ''
  const s = String(value)
  if (s.includes('"')) return '"' + s.replace(/"/g, '""') + '"'
  if (s.includes(',') || s.includes('\n') || s.includes('\r')) return '"' + s + '"'
  return s
}

// Read a CSV file and return { headers, rows }, where each row is an object
// keyed by header name. The first non-empty line is treated as the header
// row; blank lines are skipped.
async function readCSVFile(filePath) {
  const txt = await fs.readFile(filePath, 'utf8')
  const lines = txt.split(/\r?\n/)

  let headerLineIndex = null
  for (let i = 0; i < lines.length; i++) {
    if (lines[i].trim().length > 0) { headerLineIndex = i; break }
  }
  if (headerLineIndex === null) return { headers: [], rows: [] }

  const headers = parseCSVLine(lines[headerLineIndex])
  const rows = []
  for (let i = headerLineIndex + 1; i < lines.length; i++) {
    const l = lines[i]
    if (l == null || l.trim() === '') continue
    const vals = parseCSVLine(l)
    const obj = {}
    for (let j = 0; j < headers.length; j++) {
      obj[headers[j]] = vals[j] ?? ''
    }
    rows.push(obj)
  }
  return { headers, rows }
}

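// main() collects the input files, reads them, unions their headers in
// first-seen order, and writes the merged CSV. One extra rule is encoded
// below: if an input file's name (without extension) ends in "3", the values
// in its first column form a key set, and rows from any file whose name ends
// in "0" are dropped unless their first-column value is in that set. Dropped
// rows are logged to removed_rows.log next to the output file.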
async function main() {
  const opts = parseArgs(process.argv)
  if (opts.help) { usage(); return }

  const cwd = process.cwd()
  let files = []
  if (opts.inputDir) {
    const dir = path.isAbsolute(opts.inputDir) ? opts.inputDir : path.join(cwd, opts.inputDir)
    try {
      const names = await fs.readdir(dir)
      files = names.filter(n => n.toLowerCase().endsWith('.csv')).map(n => path.join(dir, n))
    } catch (e) {
      console.error('Failed to read inputDir', e.message)
      process.exit(2)
    }
  }
  if (opts.files && opts.files.length) {
    const explicit = opts.files.map(f => path.isAbsolute(f) ? f : path.join(cwd, f))
    files = files.concat(explicit)
  }

  files = [...new Set(files)]
  if (files.length === 0) {
    console.error('No CSV files specified. Use --inputDir or pass file paths.')
    usage()
    process.exit(1)
  }

  const allRows = []
  const headerOrder = []
  const headerSet = new Set()

  // Read every input file up front and remember its basename (minus
  // extension) and the last character of that name, which drives filtering.
  const fileDatas = []
  for (const f of files) {
    try {
      const stat = await fs.stat(f)
      if (!stat.isFile()) { console.warn('Skipping (not a file):', f); continue }
    } catch (e) { console.warn('Skipping (not found):', f); continue }
    const { headers, rows } = await readCSVFile(f)
    const base = path.basename(f)
    const nameNoExt = base.replace(/\.[^/.]+$/, '')
    const lastChar = nameNoExt.slice(-1)
    fileDatas.push({ path: f, headers, rows, nameNoExt, lastChar })
  }

  // Build the key set from the file whose name ends in "3": every value in
  // its first column.
  let file3Set = null
  const file3 = fileDatas.find(d => d.lastChar === '3')
  if (file3) {
    const firstHdr = file3.headers && file3.headers.length > 0 ? file3.headers[0] : null
    file3Set = new Set()
    if (firstHdr) {
      for (const r of file3.rows) {
        const v = r[firstHdr]
        if (v != null) file3Set.add(String(v))
      }
    }
  }

  // Merge rows, unioning headers in first-seen order. Rows from files whose
  // name ends in "0" are kept only if their first-column value appears in
  // file3Set; dropped rows are collected for the removal log.
  const removedRows = []
  for (const d of fileDatas) {
    const { headers, rows, lastChar } = d
    for (const h of headers) {
      if (!headerSet.has(h)) {
        headerSet.add(h)
        headerOrder.push(h)
      }
    }
    if (lastChar === '0' && file3Set) {
      const firstHdr = headers && headers.length > 0 ? headers[0] : null
      if (!firstHdr) continue
      for (const r of rows) {
        const val = r[firstHdr]
        if (val != null && file3Set.has(String(val))) {
          allRows.push(r)
        } else {
          removedRows.push({ source: d.path, key: firstHdr, value: val, row: r })
        }
      }
    } else {
      for (const r of rows) allRows.push(r)
    }
  }

  // Normalize every row to the full header set; defensively append any row
  // keys that never appeared in a header line.
  const outRows = allRows.map(r => {
    const o = {}
    for (const h of headerOrder) o[h] = (h in r) ? r[h] : ''
    for (const k of Object.keys(r)) if (!headerSet.has(k)) { headerSet.add(k); headerOrder.push(k); o[k] = r[k] }
    return o
  })

  // Resolve the output path. If --out is (or looks like) a directory,
  // write merged.csv inside it.
  let outPath = path.isAbsolute(opts.out) ? opts.out : path.join(cwd, opts.out)
  const looksLikeDir = opts.out.endsWith('/') || opts.out.endsWith('\\')
  if (looksLikeDir) {
    outPath = path.join(outPath, 'merged.csv')
  }
  try {
    const st = await fs.stat(outPath)
    if (st.isDirectory()) {
      outPath = path.join(outPath, 'merged.csv')
    }
  } catch (e) {
    // Output path does not exist yet; that is fine.
  }

  const headerLine = headerOrder.map(csvEscape).join(',')
  const lines = [headerLine]
  for (const r of outRows) {
    const vals = headerOrder.map(h => csvEscape(r[h]))
    lines.push(vals.join(','))
  }
  await fs.mkdir(path.dirname(outPath), { recursive: true })
  await fs.writeFile(outPath, lines.join('\n'), 'utf8')
  console.log(`Wrote merged CSV to ${outPath} (${outRows.length} rows, ${headerOrder.length} columns)`)

  // Report and log any rows dropped by the key filter.
  if (removedRows.length > 0) {
    console.log(`Removed ${removedRows.length} rows from files (not present in file3). Logging to removed_rows.log`)
    const logLines = []
    logLines.push(`Removed ${removedRows.length} rows - details:`)
    for (const it of removedRows) {
      logLines.push(`source=${it.source} ${it.key}=${it.value} row=${JSON.stringify(it.row)}`)
    }
    const logPath = path.join(path.dirname(outPath), 'removed_rows.log')
    await fs.writeFile(logPath, logLines.join('\n'), 'utf8')
    for (let i = 0; i < Math.min(50, logLines.length); i++) console.log(logLines[i])
    if (logLines.length > 50) console.log(`... see ${logPath} for full log`)
  } else {
    console.log('No rows were removed by file3 filtering.')
  }
}

main().catch(err => {
  console.error('Error:', err && err.stack ? err.stack : err)
  process.exit(3)
})