// rafmacalaba — feat: multi-corpus support (a2c885c)
import { HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from '../../../utils/config.js';
import fs from 'fs';
/**
 * Decides which document source the handler uses.
 *
 * @returns {boolean} `true` when `process.env.HF_TOKEN` is set to a non-empty
 *   value and `process.env.NODE_ENV` is anything other than `'development'`;
 *   `false` otherwise. Always a strict boolean, so the token value (or
 *   `undefined`) never leaks out of a predicate.
 */
const isHFSpace = () => {
  const { HF_TOKEN, NODE_ENV } = process.env;
  return Boolean(HF_TOKEN) && NODE_ENV !== 'development';
};
/** Builds a JSON `Response` carrying `payload` with the given HTTP status. */
function jsonResponse(payload, status) {
  return new Response(JSON.stringify(payload), {
    status,
    headers: { 'Content-Type': 'application/json' },
  });
}

/**
 * Strictly parses a query-string value as a non-negative safe integer.
 * Unlike `parseInt`, rejects partially numeric input such as "12abc", "1.5",
 * "-1" or " 5" instead of silently truncating it.
 *
 * @param {string} raw - Raw query-string value.
 * @returns {number|null} The parsed integer, or `null` when `raw` is not a
 *   plain run of decimal digits within the safe-integer range.
 */
function parseNonNegativeInt(raw) {
  if (!/^\d+$/.test(raw)) {
    return null;
  }
  const value = Number(raw);
  return Number.isSafeInteger(value) ? value : null;
}

/**
 * Returns a single page entry of a document as JSON.
 *
 * Query parameters:
 *   - `index`  (required) non-negative integer identifying the document.
 *   - `page`   (required) non-negative integer; matched against the first
 *              element of each entry's `document.pages`.
 *   - `corpus` (optional) passed as-is to `getCorpus`.
 *
 * The document is fetched from `HF_DATASET_BASE_URL` (with a Bearer token)
 * when `isHFSpace()` is truthy, otherwise read from the local path given by
 * `getDocLocalPath`.
 *
 * Responses (all `application/json`):
 *   - 200: the matching page entry.
 *   - 400: `index`/`page` missing or not non-negative integers.
 *   - 404: local document file missing, or no entry matches `page`.
 *   - upstream status: when the remote fetch responds with a non-OK status.
 *   - 500: any other failure (logged via `console.error`).
 *
 * @param {Request} request - Incoming request; only `request.url` is read.
 * @returns {Promise<Response>}
 */
export async function GET(request) {
  const { searchParams } = new URL(request.url);
  const index = searchParams.get('index');
  const page = searchParams.get('page');
  const corpusId = searchParams.get('corpus');

  if (index === null || page === null) {
    return jsonResponse({ error: "Missing index or page parameter" }, 400);
  }

  const indexNum = parseNonNegativeInt(index);
  const pageNum = parseNonNegativeInt(page);
  if (indexNum === null || pageNum === null) {
    return jsonResponse({ error: "index and page must be non-negative integers" }, 400);
  }

  try {
    // Resolved inside the try so that a failure in corpus lookup is reported
    // through the same JSON 500 path as every other unexpected error.
    const corpus = getCorpus(corpusId);

    let pagesData;
    if (isHFSpace()) {
      const docRepoPath = getDocRepoPath(corpus, indexNum);
      const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
      const res = await fetch(docUrl, {
        headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
      });
      if (!res.ok) {
        // The upstream status is forwarded unchanged to the client.
        return jsonResponse(
          { error: `doc_${indexNum} not found on HF (${corpus.id})` },
          res.status,
        );
      }
      pagesData = await res.json();
    } else {
      const filePath = getDocLocalPath(corpus, indexNum);
      let raw;
      try {
        // Single async read: avoids blocking the event loop and the
        // exists-then-read race of a separate existence check.
        raw = await fs.promises.readFile(filePath, 'utf-8');
      } catch (readError) {
        if (readError?.code === 'ENOENT') {
          return jsonResponse(
            { error: `doc_${indexNum} not found locally (${corpus.id})` },
            404,
          );
        }
        throw readError;
      }
      pagesData = JSON.parse(raw);
    }

    if (!Array.isArray(pagesData)) {
      throw new TypeError(`doc_${indexNum} payload is not an array of pages`);
    }

    // `p?.` tolerates null/primitive entries instead of crashing the lookup.
    const pageData = pagesData.find((p) => p?.document?.pages?.[0] === pageNum);
    if (!pageData) {
      return jsonResponse(
        { error: `Page ${pageNum} not found in doc ${indexNum} (${corpus.id})` },
        404,
      );
    }

    return jsonResponse(pageData, 200);
  } catch (error) {
    console.error(error);
    return jsonResponse({ error: "Failed to fetch document page" }, 500);
  }
}