Spaces:

romizone
/

open-chatbot

Sleeping

App Files Files Community

open-chatbot / src /lib /file-processor.ts

romizone

Upload folder using huggingface_hub

c730f0b verified 21 days ago

raw

history blame contribute delete

7.83 kB

	/* eslint-disable @typescript-eslint/no-require-imports */
	import { IMAGE_EXTENSIONS, TEXT_EXTENSIONS, SUPPORTED_EXTENSIONS } from "./constants";
	import type { FileContext } from "./types";
	import { v4 as uuidv4 } from "uuid";

	/**
	 * Returns the lower-cased text that follows the last "." in `filename`.
	 *
	 * @param filename - File name to inspect.
	 * @returns The extension without the dot, or "" when the name contains no dot.
	 */
	function getExtension(filename: string): string {
	  const lastDot = filename.lastIndexOf(".");
	  if (lastDot === -1) {
	    return "";
	  }
	  return filename.slice(lastDot + 1).toLowerCase();
	}

	/**
	 * Extracts text from an uploaded file, choosing the extractor by file extension.
	 *
	 * An unsupported extension or an error thrown by an extractor is reported
	 * through the `error` field of the returned object (with `text` left empty)
	 * rather than being thrown to the caller.
	 *
	 * @param buffer - Raw file contents.
	 * @param filename - Name of the file; its extension selects the extractor.
	 * @returns A `FileContext` with a freshly generated id, the file name and
	 *   extension, the byte size, the extracted text and an error message or `null`.
	 */
	export async function processFile(
	  buffer: Buffer,
	  filename: string
	): Promise<FileContext> {
	  const ext = getExtension(filename);
	  const result: FileContext = {
	    id: uuidv4(),
	    filename,
	    extension: ext,
	    text: "",
	    error: null,
	    size: buffer.length,
	  };

	  // Reject early so no extractor runs on a format that is not whitelisted.
	  if (!SUPPORTED_EXTENSIONS.includes(ext)) {
	    result.error = `Format '.${ext}' belum didukung.`;
	    return result;
	  }

	  try {
	    if (ext === "pdf") {
	      result.text = await processPdf(buffer);
	    } else if (ext === "doc") {
	      result.text = await processDoc(buffer);
	    } else if (ext === "docx") {
	      result.text = await processDocx(buffer);
	    } else if (ext === "xlsx" || ext === "xls") {
	      result.text = processExcel(buffer);
	    } else if (ext === "csv") {
	      result.text = processCsv(buffer);
	    } else if (IMAGE_EXTENSIONS.includes(ext)) {
	      result.text = await processImage(buffer);
	    } else if (TEXT_EXTENSIONS.includes(ext)) {
	      result.text = processText(buffer);
	    } else {
	      // Any other supported extension is also read as plain text.
	      result.text = processText(buffer);
	    }
	  } catch (e: unknown) {
	    result.error = `Error memproses '${filename}': ${e instanceof Error ? e.message : String(e)}`;
	  }

	  return result;
	}

	/**
	 * Extracts text from a PDF.
	 *
	 * The bytes are written to a temporary directory and the `pdftotext` CLI
	 * (poppler) is run on them with a 15 s timeout. When that yields more than
	 * 50 characters of trimmed text, the text is returned. Otherwise — including
	 * when `pdftotext` fails — the PDF is handed to `ocrPdf`.
	 *
	 * @param buffer - Raw PDF bytes.
	 * @returns The trimmed `pdftotext` output, or the result of `ocrPdf`.
	 */
	async function processPdf(buffer: Buffer): Promise<string> {
	  const { mkdtemp, readFile, rm, writeFile } = require("fs/promises");
	  const { tmpdir } = require("os");
	  const path = require("path");
	  const { execFile } = require("child_process");

	  // Outputs at or below this length fall through to the OCR fallback.
	  const MIN_TEXT_CHARS = 50;

	  // Step 1: Try fast text extraction with pdftotext CLI (poppler)
	  const tmpDir = await mkdtemp(path.join(tmpdir(), "pdf-txt-"));
	  const pdfPath = path.join(tmpDir, "input.pdf");
	  const txtPath = path.join(tmpDir, "output.txt");

	  try {
	    await writeFile(pdfPath, buffer);

	    await new Promise<void>((resolve, reject) => {
	      execFile(
	        "pdftotext",
	        ["-layout", pdfPath, txtPath],
	        { timeout: 15000 },
	        (error: Error | null) => {
	          if (error) reject(error);
	          else resolve();
	        }
	      );
	    });

	    const text = (await readFile(txtPath, "utf-8")).trim();
	    console.log(`[pdf] pdftotext extracted ${text.length} chars`);

	    if (text.length > MIN_TEXT_CHARS) {
	      return text;
	    }
	  } catch (e) {
	    // A pdftotext failure is not fatal: log it and fall through to OCR.
	    console.log(`[pdf] pdftotext failed: ${e instanceof Error ? e.message : String(e)}`);
	  } finally {
	    // Best-effort cleanup; a failed removal is ignored.
	    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
	  }

	  // Step 2: Fallback — convert PDF pages to images with pdftoppm, then OCR
	  console.log("[pdf] Text extraction empty, starting OCR fallback...");
	  return await ocrPdf(buffer);
	}

	/**
	 * OCRs a PDF by rasterizing its pages and running tesseract on each image.
	 *
	 * The PDF is written to a temporary directory, `pdftoppm` renders up to the
	 * first 20 pages as 300 DPI PNGs, and the `tesseract` CLI (languages
	 * `eng+ind`) is run on each image. Pages whose OCR fails or produces only
	 * whitespace are skipped. Every page that is kept is labelled with its
	 * position among the rendered pages, so skipped pages do not shift the
	 * numbering of later ones.
	 *
	 * @param buffer - Raw PDF bytes.
	 * @returns The per-page texts joined by blank lines, or a placeholder message
	 *   when no page image was produced or no page yielded any text.
	 * @throws When writing the temp file or running `pdftoppm` fails.
	 */
	async function ocrPdf(buffer: Buffer): Promise<string> {
	  const { mkdtemp, readdir, readFile, rm, writeFile } = require("fs/promises");
	  const { tmpdir } = require("os");
	  const path = require("path");
	  const { execFile } = require("child_process");

	  // Promise wrapper around execFile; rejects on a non-zero exit or a timeout.
	  const run = (command: string, args: string[], timeout: number): Promise<void> =>
	    new Promise<void>((resolve, reject) => {
	      execFile(command, args, { timeout }, (error: Error | null) => {
	        if (error) reject(error);
	        else resolve();
	      });
	    });

	  const tmpDir = await mkdtemp(path.join(tmpdir(), "pdf-ocr-"));
	  const pdfPath = path.join(tmpDir, "input.pdf");

	  try {
	    await writeFile(pdfPath, buffer);

	    // Convert PDF to PNG images using pdftoppm (poppler)
	    // Always limit to 20 pages max to avoid excessive processing
	    await run(
	      "pdftoppm",
	      ["-png", "-r", "300", "-l", "20", pdfPath, path.join(tmpDir, "page")],
	      120000
	    );

	    // Find all generated page images.
	    // NOTE(review): the lexical sort assumes pdftoppm zero-pads page numbers
	    // to a uniform width within one run — confirm against the poppler docs.
	    const files: string[] = await readdir(tmpDir);
	    const pageFiles = files
	      .filter((f) => f.startsWith("page") && f.endsWith(".png"))
	      .sort();

	    if (pageFiles.length === 0) {
	      return "(PDF berisi gambar tapi tidak dapat di-OCR)";
	    }

	    // OCR each page
	    const results: string[] = [];
	    for (const [index, pageFile] of pageFiles.entries()) {
	      const imgPath = path.join(tmpDir, pageFile);
	      const ocrBase = path.join(tmpDir, `ocr-${pageFile}`);
	      // tesseract appends ".txt" to the output base it is given.
	      const ocrPath = ocrBase + ".txt";

	      try {
	        await run("tesseract", [imgPath, ocrBase, "-l", "eng+ind"], 60000);

	        const pageText = (await readFile(ocrPath, "utf-8")).trim();
	        if (pageText) {
	          // Label with the page's own position, not the count of kept pages.
	          results.push(`--- Halaman ${index + 1} ---\n${pageText}`);
	        }
	      } catch (e) {
	        // Skip pages that fail OCR, but leave a trace in the log.
	        console.log(
	          `[pdf] OCR failed for ${pageFile}: ${e instanceof Error ? e.message : String(e)}`
	        );
	      }
	    }

	    return results.length > 0
	      ? results.join("\n\n")
	      : "(PDF berisi gambar tapi tidak dapat di-OCR)";
	  } finally {
	    // Clean up temp directory (best effort)
	    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
	  }
	}

	/**
	 * Extracts the body text of a `.doc` file with the `word-extractor` package.
	 *
	 * The buffer is written to a uniquely named file in the OS temp directory,
	 * parsed from that path, and the file is removed afterwards whether or not
	 * extraction succeeded.
	 *
	 * @param buffer - Raw document bytes.
	 * @returns The trimmed document body.
	 */
	async function processDoc(buffer: Buffer): Promise<string> {
	  const fsp = require("fs/promises");
	  const os = require("os");
	  const nodePath = require("path");
	  const docFile = nodePath.join(os.tmpdir(), `doc-${uuidv4()}.doc`);
	  try {
	    await fsp.writeFile(docFile, buffer);
	    const WordExtractor = require("word-extractor");
	    const parsed = await new WordExtractor().extract(docFile);
	    const body = parsed.getBody();
	    return body.trim();
	  } finally {
	    // Best-effort cleanup; a failed unlink is ignored.
	    await fsp.unlink(docFile).catch(() => {});
	  }
	}

	/**
	 * Extracts the raw text of a `.docx` document using mammoth.
	 *
	 * @param buffer - Raw document bytes.
	 * @returns The extracted text with surrounding whitespace trimmed.
	 */
	async function processDocx(buffer: Buffer): Promise<string> {
	  // Loaded lazily, only when a .docx file is actually processed.
	  const mammothLib = await import("mammoth");
	  const { value: rawText } = await mammothLib.extractRawText({ buffer });
	  return rawText.trim();
	}

	/**
	 * Renders every sheet of a workbook as CSV text.
	 *
	 * Each sheet becomes a `--- Sheet: <name> ---` header line followed by the
	 * sheet's CSV; consecutive sheets are separated by a blank line.
	 *
	 * @param buffer - Raw workbook bytes.
	 * @returns The concatenated per-sheet CSV sections.
	 */
	function processExcel(buffer: Buffer): string {
	  const xlsx = require("xlsx");
	  const book = xlsx.read(buffer, { type: "buffer" });
	  const sheetNames: string[] = book.SheetNames;
	  const sections = sheetNames.map(
	    (name) => `--- Sheet: ${name} ---\n${xlsx.utils.sheet_to_csv(book.Sheets[name])}`
	  );
	  return sections.join("\n\n");
	}

	/**
	 * Parses CSV bytes with the xlsx library and re-serializes the first sheet as CSV.
	 *
	 * @param buffer - Raw CSV bytes.
	 * @returns The CSV text produced from the first parsed sheet.
	 */
	function processCsv(buffer: Buffer): string {
	  const xlsx = require("xlsx");
	  const book = xlsx.read(buffer, { type: "buffer" });
	  const firstSheetName = book.SheetNames[0];
	  const firstSheet = book.Sheets[firstSheetName];
	  return xlsx.utils.sheet_to_csv(firstSheet);
	}

	/**
	 * Runs OCR on an image with the system `tesseract` CLI.
	 *
	 * The image bytes are written to a uniquely named temp file, tesseract is run
	 * on it with languages `eng+ind` and a 60 s timeout, and the text file it
	 * produces is read back. Both temp files are removed afterwards, whether or
	 * not OCR succeeded.
	 *
	 * @param buffer - Raw image bytes.
	 * @returns The recognized text with surrounding whitespace trimmed.
	 * @throws When the temp file cannot be written, tesseract fails or times out,
	 *   or its output file cannot be read.
	 */
	async function processImage(buffer: Buffer): Promise<string> {
	  const { writeFile, readFile, unlink } = require("fs/promises");
	  const { tmpdir } = require("os");
	  const path = require("path");
	  const { execFile } = require("child_process");

	  const inputPath = path.join(tmpdir(), `ocr-${uuidv4()}`);
	  const outputBase = path.join(tmpdir(), `ocr-out-${uuidv4()}`);
	  // tesseract appends ".txt" to the output base it is given.
	  const outputPath = outputBase + ".txt";

	  try {
	    await writeFile(inputPath, buffer);

	    // Use system tesseract CLI — avoids Turbopack module resolution issues
	    await new Promise<void>((resolve, reject) => {
	      execFile(
	        "tesseract",
	        [inputPath, outputBase, "-l", "eng+ind"],
	        { timeout: 60000 },
	        (error: Error | null) => {
	          if (error) reject(error);
	          else resolve();
	        }
	      );
	    });

	    const text = await readFile(outputPath, "utf-8");
	    return text.trim();
	  } finally {
	    // Best-effort cleanup; failed unlinks are ignored.
	    await unlink(inputPath).catch(() => {});
	    await unlink(outputPath).catch(() => {});
	  }
	}

	/**
	 * Decodes a buffer as text: strict UTF-8 first, Latin-1 when the bytes are
	 * not valid UTF-8.
	 *
	 * `Buffer.toString("utf-8")` never throws — it substitutes U+FFFD for invalid
	 * sequences — so a strict `TextDecoder` is used to actually detect non-UTF-8
	 * input and make the Latin-1 fallback reachable.
	 *
	 * @param buffer - Raw file bytes.
	 * @returns The decoded text.
	 */
	function processText(buffer: Buffer): string {
	  try {
	    // `fatal` makes invalid UTF-8 throw; `ignoreBOM` keeps a leading BOM in the
	    // output, matching what `buffer.toString("utf-8")` yields for valid input.
	    return new TextDecoder("utf-8", { fatal: true, ignoreBOM: true }).decode(buffer);
	  } catch {
	    return buffer.toString("latin1");
	  }
	}