// open-chatbot/src/lib/file-processor.ts
// romizone's picture
// Upload folder using huggingface_hub
// c730f0b verified
/* eslint-disable @typescript-eslint/no-require-imports */
import { IMAGE_EXTENSIONS, TEXT_EXTENSIONS, SUPPORTED_EXTENSIONS } from "./constants";
import type { FileContext } from "./types";
import { v4 as uuidv4 } from "uuid";
/**
 * Returns the lower-cased text after the last "." in `filename`.
 *
 * @param filename - File name to inspect.
 * @returns The extension without the dot, or "" when the name contains no dot
 *   (a trailing dot also yields "").
 */
function getExtension(filename: string): string {
  const lastDot = filename.lastIndexOf(".");
  if (lastDot === -1) {
    return "";
  }
  return filename.slice(lastDot + 1).toLowerCase();
}
/**
 * Converts an uploaded file into a `FileContext` holding its extracted text.
 *
 * The extension (taken from `filename`) selects the extractor. Unsupported
 * extensions and extractor failures are reported through `result.error`
 * rather than by throwing.
 *
 * @param buffer - Raw file contents.
 * @param filename - Original file name; only its extension drives dispatch.
 * @returns A `FileContext` with a fresh id, the file size, and either `text`
 *   or `error` populated.
 */
export async function processFile(
  buffer: Buffer,
  filename: string
): Promise<FileContext> {
  const ext = getExtension(filename);
  const result: FileContext = {
    id: uuidv4(),
    filename,
    extension: ext,
    text: "",
    error: null,
    size: buffer.length,
  };

  // Guard clause: reject anything outside the supported list up front.
  if (!SUPPORTED_EXTENSIONS.includes(ext)) {
    result.error = `Format '.${ext}' belum didukung.`;
    return result;
  }

  // Picks the extractor for this extension; any throw surfaces as a rejection.
  const extractText = async (): Promise<string> => {
    switch (ext) {
      case "pdf":
        return processPdf(buffer);
      case "doc":
        return processDoc(buffer);
      case "docx":
        return processDocx(buffer);
      case "xlsx":
      case "xls":
        return processExcel(buffer);
      case "csv":
        return processCsv(buffer);
      default:
        break;
    }
    if (IMAGE_EXTENSIONS.includes(ext)) {
      return processImage(buffer);
    }
    // Known text formats and any other supported extension are both decoded
    // as plain text.
    if (TEXT_EXTENSIONS.includes(ext)) {
      return processText(buffer);
    }
    return processText(buffer);
  };

  try {
    result.text = await extractText();
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    result.error = `Error memproses '${filename}': ${reason}`;
  }
  return result;
}
/**
 * Extracts text from a PDF.
 *
 * First runs the `pdftotext` CLI (15 s timeout) on a copy of the PDF written
 * to a fresh temp directory. If that yields more than 50 characters the text
 * is returned; otherwise (too little text or a failed run) the work is handed
 * to `ocrPdf`. The temp directory is always removed.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The extracted text, or whatever `ocrPdf` returns.
 */
async function processPdf(buffer: Buffer): Promise<string> {
  const { mkdtemp, readFile, rm, writeFile } = require("fs/promises");
  const { tmpdir } = require("os");
  const path = require("path");
  const { execFile } = require("child_process");

  // Step 1: Try fast text extraction with pdftotext CLI (poppler)
  const workDir = await mkdtemp(path.join(tmpdir(), "pdf-txt-"));
  const inputPdf = path.join(workDir, "input.pdf");
  const outputTxt = path.join(workDir, "output.txt");

  // Promise wrapper around the callback-style execFile call.
  const runPdftotext = (): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      execFile(
        "pdftotext",
        ["-layout", inputPdf, outputTxt],
        { timeout: 15000 },
        (error: Error | null) => {
          if (error) {
            reject(error);
            return;
          }
          resolve();
        }
      );
    });

  try {
    await writeFile(inputPdf, buffer);
    await runPdftotext();
    const raw: string = await readFile(outputTxt, "utf-8");
    const extracted = raw.trim();
    console.log(`[pdf] pdftotext extracted ${extracted.length} chars`);
    // Anything at or below 50 chars is treated as "no usable text layer".
    if (extracted.length > 50) {
      return extracted;
    }
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.log(`[pdf] pdftotext failed: ${reason}`);
  } finally {
    // Best-effort cleanup; a failed removal must not mask the result.
    await rm(workDir, { recursive: true, force: true }).catch(() => {});
  }

  // Step 2: Fallback — convert PDF pages to images with pdftoppm, then OCR
  console.log("[pdf] Text extraction empty, starting OCR fallback...");
  return await ocrPdf(buffer);
}
/**
 * OCR fallback for PDFs without a usable text layer.
 *
 * Renders at most the first 20 pages to 300-DPI PNGs with `pdftoppm`
 * (120 s timeout), then runs `tesseract` (`eng+ind`, 60 s timeout per page)
 * on each image. Pages whose OCR fails or yields only whitespace are skipped.
 * Everything happens inside a temp directory that is always removed.
 *
 * Each page section is labelled with the page's real number, taken from the
 * numeric suffix pdftoppm puts in the image file name, so skipping a page
 * does not shift the labels of the pages after it.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The per-page OCR text joined by blank lines, or a placeholder
 *   message when no page produced any text.
 * @throws If writing the PDF or running `pdftoppm` fails.
 */
async function ocrPdf(buffer: Buffer): Promise<string> {
  const { writeFile, readFile, readdir } = require("fs/promises");
  const { mkdtemp, rm } = require("fs/promises");
  const { tmpdir } = require("os");
  const path = require("path");
  const { execFile } = require("child_process");

  // Page number encoded in a pdftoppm output name ("page-07.png" -> 7);
  // 0 when the name carries no numeric suffix.
  const pageNumberOf = (file: string): number => {
    const match = /(\d+)\.png$/.exec(file);
    return match ? Number(match[1]) : 0;
  };

  const tmpDir = await mkdtemp(path.join(tmpdir(), "pdf-ocr-"));
  const pdfPath = path.join(tmpDir, "input.pdf");
  try {
    await writeFile(pdfPath, buffer);

    // Convert PDF to PNG images using pdftoppm (poppler)
    // Always limit to 20 pages max to avoid excessive processing
    const args = ["-png", "-r", "300", "-l", "20", pdfPath, path.join(tmpDir, "page")];
    await new Promise<void>((resolve, reject) => {
      execFile(
        "pdftoppm",
        args,
        { timeout: 120000 },
        (error: Error | null) => {
          if (error) reject(error);
          else resolve();
        }
      );
    });

    // Find all generated page images, ordered by page number. Sorting
    // numerically keeps the order right even if the names are not
    // zero-padded; the name itself is the tie-breaker.
    const files = await readdir(tmpDir);
    const pageFiles: string[] = files
      .filter((f: string) => f.startsWith("page") && f.endsWith(".png"))
      .sort((a: string, b: string) => {
        const byNumber = pageNumberOf(a) - pageNumberOf(b);
        if (byNumber !== 0) return byNumber;
        return a < b ? -1 : a > b ? 1 : 0;
      });

    if (pageFiles.length === 0) {
      return "(PDF berisi gambar tapi tidak dapat di-OCR)";
    }

    // OCR each page
    const results: string[] = [];
    let position = 0;
    for (const pageFile of pageFiles) {
      position += 1;
      // Prefer the number from the file name; fall back to the position in
      // the sorted list when the name has no numeric suffix.
      const pageNumber = pageNumberOf(pageFile) || position;
      const imgPath = path.join(tmpDir, pageFile);
      const ocrBase = path.join(tmpDir, `ocr-${pageFile}`);
      const ocrPath = ocrBase + ".txt";
      try {
        await new Promise<void>((resolve, reject) => {
          execFile(
            "tesseract",
            [imgPath, ocrBase, "-l", "eng+ind"],
            { timeout: 60000 },
            (error: Error | null) => {
              if (error) reject(error);
              else resolve();
            }
          );
        });
        const pageText: string = await readFile(ocrPath, "utf-8");
        const trimmed = pageText.trim();
        if (trimmed) {
          results.push(`--- Halaman ${pageNumber} ---\n${trimmed}`);
        }
      } catch (e: unknown) {
        // Skip pages that fail OCR, but leave a trace for debugging.
        const reason = e instanceof Error ? e.message : String(e);
        console.log(`[pdf] OCR failed for page ${pageNumber}: ${reason}`);
      }
    }

    return results.length > 0
      ? results.join("\n\n")
      : "(PDF berisi gambar tapi tidak dapat di-OCR)";
  } finally {
    // Clean up temp directory
    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
  }
}
/**
 * Extracts the body text of a legacy `.doc` file.
 *
 * The bytes are written to a uniquely named temp file, which is handed to
 * `word-extractor` by path; the temp file is removed afterwards, whether or
 * not extraction succeeded.
 *
 * @param buffer - Raw `.doc` bytes.
 * @returns The trimmed document body.
 */
async function processDoc(buffer: Buffer): Promise<string> {
  const fsp = require("fs/promises");
  const os = require("os");
  const path = require("path");

  const scratchFile = path.join(os.tmpdir(), `doc-${uuidv4()}.doc`);
  try {
    await fsp.writeFile(scratchFile, buffer);
    const WordExtractor = require("word-extractor");
    const parsed = await new WordExtractor().extract(scratchFile);
    const body: string = parsed.getBody();
    return body.trim();
  } finally {
    // Best-effort removal of the temp file.
    await fsp.unlink(scratchFile).catch(() => {});
  }
}
/**
 * Extracts the raw text of a `.docx` file using `mammoth`.
 *
 * @param buffer - Raw `.docx` bytes.
 * @returns The trimmed raw text reported by mammoth.
 */
async function processDocx(buffer: Buffer): Promise<string> {
  // Loaded on demand so the library is only pulled in when a .docx arrives.
  const docxReader = await import("mammoth");
  const { value: rawText } = await docxReader.extractRawText({ buffer });
  return rawText.trim();
}
/**
 * Converts every sheet of an Excel workbook to CSV text.
 *
 * @param buffer - Raw `.xlsx` / `.xls` bytes.
 * @returns One "--- Sheet: <name> ---" section per sheet, each followed by
 *   that sheet's CSV, with sections separated by a blank line.
 */
function processExcel(buffer: Buffer): string {
  const XLSX = require("xlsx");
  const book = XLSX.read(buffer, { type: "buffer" });

  const sections: string[] = book.SheetNames.map((name: string) => {
    const asCsv = XLSX.utils.sheet_to_csv(book.Sheets[name]);
    return `--- Sheet: ${name} ---\n${asCsv}`;
  });

  return sections.join("\n\n");
}
/**
 * Normalizes a CSV file by reading it with SheetJS and writing its first
 * sheet back out as comma-separated text.
 *
 * `raw: true` tells SheetJS not to parse plain-text cell values, so each cell
 * is kept as the string found in the file. Without it, values are parsed and
 * re-formatted: leading zeros are dropped, long numeric IDs are rendered in
 * scientific notation, and date-like strings are rewritten.
 *
 * @param buffer - Raw CSV bytes.
 * @returns The CSV text of the first sheet, or "" when the parsed workbook
 *   has no sheets.
 */
function processCsv(buffer: Buffer): string {
  const XLSX = require("xlsx");
  const workbook = XLSX.read(buffer, { type: "buffer", raw: true });

  // Guard against a workbook without sheets instead of passing `undefined`
  // to sheet_to_csv.
  const firstSheetName: string | undefined = workbook.SheetNames[0];
  if (firstSheetName === undefined) {
    return "";
  }

  return XLSX.utils.sheet_to_csv(workbook.Sheets[firstSheetName]);
}
/**
 * Runs OCR on an image with the system `tesseract` CLI.
 *
 * The image is written to a uniquely named temp file and tesseract is run
 * with the `eng+ind` languages and a 60 s timeout. Its `.txt` output is then
 * read back, and both temp files are removed afterwards on a best-effort
 * basis.
 *
 * @param buffer - Raw image bytes.
 * @returns The trimmed OCR text.
 * @throws If writing the input, running tesseract, or reading its output fails.
 */
async function processImage(buffer: Buffer): Promise<string> {
  const { readFile, unlink, writeFile } = require("fs/promises");
  const { tmpdir } = require("os");
  const path = require("path");
  const { execFile } = require("child_process");

  const imageFile = path.join(tmpdir(), `ocr-${uuidv4()}`);
  const resultBase = path.join(tmpdir(), `ocr-out-${uuidv4()}`);
  // tesseract appends ".txt" to the output base it is given.
  const resultFile = resultBase + ".txt";

  // Use system tesseract CLI — avoids Turbopack module resolution issues
  const runTesseract = (): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      execFile(
        "tesseract",
        [imageFile, resultBase, "-l", "eng+ind"],
        { timeout: 60000 },
        (error: Error | null) => {
          if (error) {
            reject(error);
            return;
          }
          resolve();
        }
      );
    });

  try {
    await writeFile(imageFile, buffer);
    await runTesseract();
    const recognized: string = await readFile(resultFile, "utf-8");
    return recognized.trim();
  } finally {
    // Remove input first, then output; ignore files that were never created.
    for (const leftover of [imageFile, resultFile]) {
      await unlink(leftover).catch(() => {});
    }
  }
}
/**
 * Decodes a text file, preferring UTF-8 and falling back to Latin-1.
 *
 * `Buffer#toString("utf-8")` never throws on malformed input (it substitutes
 * U+FFFD), so a try/catch around it can never reach a fallback. A fatal
 * `TextDecoder` is used instead: it throws on invalid UTF-8, which lets the
 * Latin-1 fallback run. `ignoreBOM: true` keeps a leading BOM in the output,
 * so valid UTF-8 input decodes exactly as `Buffer#toString("utf-8")` would.
 *
 * @param buffer - Raw file bytes.
 * @returns The UTF-8 decoding when the bytes are valid UTF-8, otherwise the
 *   Latin-1 decoding.
 */
function processText(buffer: Buffer): string {
  try {
    return new TextDecoder("utf-8", { fatal: true, ignoreBOM: true }).decode(buffer);
  } catch {
    // Not valid UTF-8: Latin-1 maps every byte to a character, so it cannot fail.
    return buffer.toString("latin1");
  }
}