// Scraped page header (not code): rafmacalaba's picture · feat: multi-corpus support · a2c885c
import { NextResponse } from 'next/server';
import fs from 'fs';
import { commit } from '@huggingface/hub';
import { HF_DATASET_ID, HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from '../../../utils/config.js';
/**
 * Tells the handlers in this file whether to read/write documents through the
 * Hugging Face dataset repo (true) or the local filesystem (false).
 *
 * True only when `HF_TOKEN` is set to a non-empty value and `NODE_ENV` is not
 * `'development'`. The result is always a strict boolean, never `undefined`
 * or an empty string.
 *
 * @returns {boolean}
 */
const isHFSpace = () =>
  Boolean(process.env.HF_TOKEN) && process.env.NODE_ENV !== 'development';
/**
 * PUT /api/validate
 * Body: { corpus, document_index, page_number, dataset_index, updates }
 *
 * Updates one dataset entry on one page of a document. The page is the element
 * whose `document.pages[0]` equals `page_number`; the entry is
 * `datasets[dataset_index]` on that page.
 *
 * - If `updates` contains any of `human_validated`, `human_verdict`,
 *   `human_notes`, `annotator`, `validated_at`, a validation record is built
 *   from those fields and stored in the entry's `validations` array, replacing
 *   any existing record by the same annotator (default annotator: 'unknown').
 * - Otherwise `updates` is shallow-merged into the entry.
 *
 * The document is loaded from and saved to the Hugging Face dataset repo when
 * `isHFSpace()` is truthy, and the local JSON file otherwise.
 *
 * Responses:
 * - 200 `{ success: true, dataset }` with the updated entry.
 * - 400 for a malformed body, missing fields, non-integer ids, a non-object
 *   `updates`, or an out-of-range `dataset_index`.
 * - 404 when the document or page cannot be found.
 * - 500 `{ error }` for any other failure.
 *
 * @param {Request} request - Incoming request; only `request.json()` is used.
 * @returns {Promise<Response>} JSON response built with `NextResponse.json`.
 */
export async function PUT(request) {
  try {
    // A body that is not parseable JSON (or not an object) is the caller's
    // mistake, so answer 400 instead of falling through to the 500 handler.
    let body;
    try {
      body = await request.json();
    } catch {
      return NextResponse.json({ error: 'Request body must be valid JSON' }, { status: 400 });
    }
    if (body === null || typeof body !== 'object' || Array.isArray(body)) {
      return NextResponse.json({ error: 'Request body must be a JSON object' }, { status: 400 });
    }

    const { corpus: corpusId, updates } = body;
    const corpus = getCorpus(corpusId);

    if (body.document_index == null || body.page_number == null || body.dataset_index == null || !updates) {
      return NextResponse.json(
        { error: 'Missing document_index, page_number, dataset_index, or updates' },
        { status: 400 }
      );
    }

    // Accept integers and integer-looking strings only. This keeps fractional
    // indices from slipping past the range check and keeps arbitrary strings
    // out of the document path builders.
    const toInteger = (value) => {
      const parsed = typeof value === 'string' && /^-?\d+$/.test(value) ? Number(value) : value;
      return Number.isSafeInteger(parsed) ? parsed : null;
    };
    const docIndex = toInteger(body.document_index);
    const pageNumber = toInteger(body.page_number);
    const datasetIndex = toInteger(body.dataset_index);
    if (docIndex === null || pageNumber === null || datasetIndex === null) {
      return NextResponse.json(
        { error: 'document_index, page_number, and dataset_index must be integers' },
        { status: 400 }
      );
    }

    // The `in` operator below throws on primitives, and spreading an array
    // would merge numeric keys into the entry, so require a plain object.
    if (typeof updates !== 'object' || Array.isArray(updates)) {
      return NextResponse.json({ error: 'updates must be an object' }, { status: 400 });
    }

    // Load the document's page list from the Hub or from disk.
    let pagesData;
    if (isHFSpace()) {
      const repoPath = getDocRepoPath(corpus, docIndex);
      const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
      const res = await fetch(url, {
        headers: { Authorization: `Bearer ${process.env.HF_TOKEN}` },
      });
      if (!res.ok) {
        return NextResponse.json({ error: `Document not found on HF (${corpus.id})` }, { status: 404 });
      }
      pagesData = await res.json();
    } else {
      const filePath = getDocLocalPath(corpus, docIndex);
      if (!fs.existsSync(filePath)) {
        return NextResponse.json({ error: `Document not found locally (${corpus.id})` }, { status: 404 });
      }
      pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    }

    const pageIdx = pagesData.findIndex((p) => p.document?.pages?.[0] === pageNumber);
    if (pageIdx === -1) {
      return NextResponse.json({ error: `Page ${pageNumber} not found` }, { status: 404 });
    }

    const page = pagesData[pageIdx];
    const datasets = page.datasets || [];
    if (datasetIndex < 0 || datasetIndex >= datasets.length) {
      return NextResponse.json({ error: `Dataset index ${datasetIndex} out of range` }, { status: 400 });
    }

    // Per-annotator validation
    const currentEntry = datasets[datasetIndex];
    const annotator = updates.annotator || 'unknown';
    const validationFields = ['human_validated', 'human_verdict', 'human_notes', 'annotator', 'validated_at'];
    const isValidation = validationFields.some((field) => field in updates);

    if (isValidation) {
      // Work on a copy so a missing or non-array `validations` value is tolerated.
      const validations = Array.isArray(currentEntry?.validations) ? [...currentEntry.validations] : [];
      const existingIdx = validations.findIndex((v) => v?.annotator === annotator);
      const validationEntry = {
        human_validated: updates.human_validated,
        human_verdict: updates.human_verdict,
        human_notes: updates.human_notes || null,
        annotator,
        validated_at: updates.validated_at || new Date().toISOString(),
      };
      // One record per annotator: overwrite in place, otherwise append.
      if (existingIdx >= 0) {
        validations[existingIdx] = validationEntry;
      } else {
        validations.push(validationEntry);
      }
      datasets[datasetIndex] = { ...currentEntry, validations };
    } else {
      datasets[datasetIndex] = { ...currentEntry, ...updates };
    }

    // Save back
    const serialized = JSON.stringify(pagesData, null, 2);
    if (isHFSpace()) {
      await commit({
        repo: { type: 'dataset', name: HF_DATASET_ID },
        credentials: { accessToken: process.env.HF_TOKEN },
        title: `Validate ${corpus.id}/doc_${docIndex} page ${pageNumber}`,
        operations: [{
          operation: 'addOrUpdate',
          path: getDocRepoPath(corpus, docIndex),
          content: new Blob([serialized], { type: 'application/json' }),
        }],
      });
    } else {
      fs.writeFileSync(getDocLocalPath(corpus, docIndex), serialized);
    }

    return NextResponse.json({
      success: true,
      dataset: datasets[datasetIndex],
    });
  } catch (error) {
    console.error('Validate error:', error);
    return NextResponse.json({ error: `Failed to validate: ${error.message}` }, { status: 500 });
  }
}
/**
 * DELETE /api/validate?corpus=X&doc=X&page=Y&idx=Z
 *
 * Removes `datasets[idx]` from the page whose `document.pages[0]` equals
 * `page`, in document `doc` of the given corpus, then saves the document back.
 * The document is read from and written to the Hugging Face dataset repo when
 * `isHFSpace()` is truthy, and the local JSON file otherwise.
 *
 * `doc`, `page`, and `idx` must each be a whole integer. Values with trailing
 * characters or a fractional part (e.g. `1abc`, `0.9`) are rejected rather
 * than truncated, because a silently truncated index would delete a different
 * entry than the one requested.
 *
 * Responses:
 * - 200 `{ success: true }`.
 * - 400 when a parameter is missing or not an integer, or `idx` is out of range.
 * - 404 when the document or page cannot be found.
 * - 500 `{ error }` for any other failure.
 *
 * @param {Request} request - Incoming request; only `request.url` is used.
 * @returns {Promise<Response>} JSON response built with `NextResponse.json`.
 */
export async function DELETE(request) {
  try {
    const { searchParams } = new URL(request.url);

    // Strict integer parsing: the whole parameter must be an optionally signed
    // run of digits, otherwise the result is NaN.
    const readInteger = (name) => {
      const raw = searchParams.get(name);
      if (raw === null || !/^[+-]?\d+$/.test(raw.trim())) {
        return Number.NaN;
      }
      return Number.parseInt(raw, 10);
    };

    const docIndex = readInteger('doc');
    const pageNumber = readInteger('page');
    const datasetIndex = readInteger('idx');
    const corpus = getCorpus(searchParams.get('corpus'));

    if ([docIndex, pageNumber, datasetIndex].some(Number.isNaN)) {
      return NextResponse.json(
        { error: 'Missing doc, page, or idx parameter' },
        { status: 400 }
      );
    }

    // Load the document's page list from the Hub or from disk.
    let pagesData;
    if (isHFSpace()) {
      const repoPath = getDocRepoPath(corpus, docIndex);
      const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
      const res = await fetch(url, {
        headers: { Authorization: `Bearer ${process.env.HF_TOKEN}` },
      });
      if (!res.ok) {
        return NextResponse.json({ error: `Document not found on HF (${corpus.id})` }, { status: 404 });
      }
      pagesData = await res.json();
    } else {
      const filePath = getDocLocalPath(corpus, docIndex);
      if (!fs.existsSync(filePath)) {
        return NextResponse.json({ error: `Document not found locally (${corpus.id})` }, { status: 404 });
      }
      pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    }

    const pageIdx = pagesData.findIndex((p) => p.document?.pages?.[0] === pageNumber);
    if (pageIdx === -1) {
      return NextResponse.json({ error: `Page ${pageNumber} not found` }, { status: 404 });
    }

    const datasets = pagesData[pageIdx].datasets || [];
    if (datasetIndex < 0 || datasetIndex >= datasets.length) {
      return NextResponse.json({ error: `Dataset index ${datasetIndex} out of range` }, { status: 400 });
    }

    // `datasets` is the page's own array here (the range check above fails
    // when it is absent), so this removes the entry from `pagesData` itself.
    datasets.splice(datasetIndex, 1);

    const serialized = JSON.stringify(pagesData, null, 2);
    if (isHFSpace()) {
      await commit({
        repo: { type: 'dataset', name: HF_DATASET_ID },
        credentials: { accessToken: process.env.HF_TOKEN },
        title: `Delete from ${corpus.id}/doc_${docIndex} page ${pageNumber}`,
        operations: [{
          operation: 'addOrUpdate',
          path: getDocRepoPath(corpus, docIndex),
          content: new Blob([serialized], { type: 'application/json' }),
        }],
      });
    } else {
      fs.writeFileSync(getDocLocalPath(corpus, docIndex), serialized);
    }

    return NextResponse.json({ success: true });
  } catch (error) {
    console.error('Delete error:', error);
    return NextResponse.json({ error: `Failed to delete: ${error.message}` }, { status: 500 });
  }
}