Spaces: Sleeping
Commit 5411a7d · 1 Parent(s): ec9f00b
Push ingestion Py README
- ingestion_js/app/api/files/chunks/route.ts +16 -0
- ingestion_js/app/api/files/route.ts +14 -0
- ingestion_js/app/api/upload/route.ts +93 -0
- ingestion_js/lib/chunker.ts +62 -0
- ingestion_js/lib/jobs.ts +26 -0
- ingestion_js/lib/summarizer.ts +19 -0
- ingestion_js/tsconfig.json +5 -1
- ingestion_python/README.md +1 -1
ingestion_js/app/api/files/chunks/route.ts
ADDED
@@ -0,0 +1,16 @@
+import { NextRequest, NextResponse } from 'next/server'
+import { getFileChunks } from '@/lib/mongo'
+
+export const dynamic = 'force-dynamic'
+export const runtime = 'nodejs'
+
+export async function GET(req: NextRequest) {
+  const { searchParams } = new URL(req.url)
+  const user_id = searchParams.get('user_id') || ''
+  const project_id = searchParams.get('project_id') || ''
+  const filename = searchParams.get('filename') || ''
+  const limit = parseInt(searchParams.get('limit') || '20', 10)
+  if (!user_id || !project_id || !filename) return NextResponse.json({ error: 'user_id, project_id and filename are required' }, { status: 400 })
+  const chunks = await getFileChunks(user_id, project_id, filename, limit)
+  return NextResponse.json({ chunks })
+}
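The chunks route is a thin read layer over Mongo: all required inputs arrive as query parameters, and anything missing short-circuits to a 400. A minimal client sketch, assuming a local dev server; BASE, u1, p1 and report.pdf are placeholders, not part of the commit:

const BASE = 'http://localhost:3000'
const qs = new URLSearchParams({ user_id: 'u1', project_id: 'p1', filename: 'report.pdf', limit: '5' })
const res = await fetch(`${BASE}/api/files/chunks?${qs}`)  // limit defaults to 20 when omitted
const { chunks } = await res.json()                        // or { error } with status 400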
ingestion_js/app/api/files/route.ts
ADDED
@@ -0,0 +1,14 @@
+import { NextRequest, NextResponse } from 'next/server'
+import { listFiles } from '@/lib/mongo'
+
+export const dynamic = 'force-dynamic'
+export const runtime = 'nodejs'
+
+export async function GET(req: NextRequest) {
+  const { searchParams } = new URL(req.url)
+  const user_id = searchParams.get('user_id') || ''
+  const project_id = searchParams.get('project_id') || ''
+  if (!user_id || !project_id) return NextResponse.json({ error: 'user_id and project_id are required' }, { status: 400 })
+  const files = await listFiles(user_id, project_id)
+  return NextResponse.json({ files, filenames: files.map((f: any) => f.filename) })
+}
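The files route follows the same pattern with one fewer parameter; the response carries both the full file documents and a filenames convenience array. Same assumptions as the sketch above:

const res = await fetch(`${BASE}/api/files?user_id=u1&project_id=p1`)
const { files, filenames } = await res.json()  // filenames === files.map(f => f.filename)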
ingestion_js/app/api/upload/route.ts
ADDED
@@ -0,0 +1,93 @@
+import { NextRequest, NextResponse } from 'next/server'
+import { randomUUID } from 'crypto'
+import { extractPages } from '@/lib/parser'
+import { captionImage } from '@/lib/captioner'
+import { buildCardsFromPages } from '@/lib/chunker'
+import { embedRemote } from '@/lib/embedder'
+import { deleteFileData, storeCards, upsertFileSummary } from '@/lib/mongo'
+import { cheapSummarize } from '@/lib/summarizer'
+import { createJob, getJob, updateJob } from '@/lib/jobs'
+
+export const dynamic = 'force-dynamic'
+export const runtime = 'nodejs'
+
+export async function GET(req: NextRequest) {
+  // Status endpoint: /api/upload?job_id=...
+  const { searchParams } = new URL(req.url)
+  const job_id = searchParams.get('job_id')
+  if (!job_id) return NextResponse.json({ error: 'job_id is required' }, { status: 400 })
+  const job = await getJob(job_id)
+  if (!job) return NextResponse.json({ error: 'job not found' }, { status: 404 })
+  return NextResponse.json({ job_id, status: job.status, total: job.total, completed: job.completed, last_error: job.last_error })
+}
+
+export async function POST(req: NextRequest) {
+  const form = await req.formData()
+  const user_id = String(form.get('user_id') || '')
+  const project_id = String(form.get('project_id') || '')
+  const fileEntries = form.getAll('files') as File[]
+  const replaceRaw = form.get('replace_filenames') as string | null
+  const renameRaw = form.get('rename_map') as string | null
+
+  if (!user_id || !project_id || fileEntries.length === 0) {
+    return NextResponse.json({ error: 'user_id, project_id and files are required' }, { status: 400 })
+  }
+
+  const maxFiles = parseInt(process.env.MAX_FILES_PER_UPLOAD || '15', 10)
+  const maxMb = parseInt(process.env.MAX_FILE_MB || '50', 10)
+  if (fileEntries.length > maxFiles) return NextResponse.json({ error: `Too many files. Max ${maxFiles} allowed per upload.` }, { status: 400 })
+
+  let replaceSet = new Set<string>()
+  try { if (replaceRaw) replaceSet = new Set<string>(JSON.parse(replaceRaw)) } catch {}
+  let renameMap: Record<string, string> = {}
+  try { if (renameRaw) renameMap = JSON.parse(renameRaw) } catch {}
+
+  const preloaded: Array<{ name: string; buf: Buffer }> = []
+  for (const f of fileEntries) {
+    const arr = Buffer.from(await f.arrayBuffer())
+    const sizeMb = arr.byteLength / (1024 * 1024)
+    if (sizeMb > maxMb) return NextResponse.json({ error: `${f.name} exceeds ${maxMb} MB limit` }, { status: 400 })
+    const eff = renameMap[f.name] || f.name
+    preloaded.push({ name: eff, buf: arr })
+  }
+
+  const job_id = randomUUID()
+  await createJob(job_id, preloaded.length)
+
+  // Fire-and-forget background processing; respond immediately with the job id
+  processAll(job_id, user_id, project_id, preloaded, replaceSet).catch(async (e) => {
+    await updateJob(job_id, { status: 'failed', last_error: String(e) })
+  })
+
+  return NextResponse.json({ job_id, status: 'processing', total_files: preloaded.length })
+}
+
+async function processAll(job_id: string, user_id: string, project_id: string, files: Array<{ name: string; buf: Buffer }>, replaceSet: Set<string>) {
+  for (let i = 0; i < files.length; i++) {
+    const { name: fname, buf } = files[i]
+    try {
+      if (replaceSet.has(fname)) {
+        await deleteFileData(user_id, project_id, fname)
+      }
+
+      const pages = await extractPages(fname, buf)
+
+      // Best-effort captioning: the parser doesn't expose images, so we skip it here to keep behavior parity; integrate captionImage if images become available.
+      // If images were available, we would append [Image] caption lines to the page text here.
+
+      const cards = await buildCardsFromPages(pages, fname, user_id, project_id)
+      const vectors = await embedRemote(cards.map(c => c.content))
+      for (let k = 0; k < cards.length; k++) (cards[k] as any).embedding = vectors[k]
+
+      await storeCards(cards)
+
+      const fullText = pages.map(p => p.text || '').join('\n\n')
+      const summary = await cheapSummarize(fullText, 6)
+      await upsertFileSummary(user_id, project_id, fname, summary)
+
+      await updateJob(job_id, { completed: i + 1, status: (i + 1) < files.length ? 'processing' as const : 'completed' as const })
+    } catch (e: any) {
+      await updateJob(job_id, { completed: i + 1, last_error: String(e) })
+    }
+  }
+}
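POST returns the job_id before any parsing happens, so the client's contract is: upload once, then poll the same route's GET handler until the status leaves 'processing'. Persisting jobs in Mongo (see lib/jobs.ts below) rather than in process memory is what keeps that poll reliable if it lands on a different instance. A hedged end-to-end sketch with placeholder IDs and file bytes:

const form = new FormData()
form.append('user_id', 'u1')
form.append('project_id', 'p1')
form.append('files', new File([bytes], 'report.pdf'))             // bytes: a BlobPart you already have
form.append('replace_filenames', JSON.stringify(['report.pdf']))  // optional: delete the old copy first
const { job_id } = await (await fetch(`${BASE}/api/upload`, { method: 'POST', body: form })).json()

let job = { status: 'processing' }
while (job.status === 'processing') {
  await new Promise(r => setTimeout(r, 2000))
  job = await (await fetch(`${BASE}/api/upload?job_id=${job_id}`)).json()  // { status, total, completed, last_error }
}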
ingestion_js/lib/chunker.ts
ADDED
@@ -0,0 +1,62 @@
+import slugify from 'slugify'
+import type { Page } from './parser'
+import { cheapSummarize, cleanChunkText } from './summarizer'
+
+const MAX_WORDS = 220
+const OVERLAP_WORDS = 40
+
+function byHeadings(text: string): string[] {
+  const lines = text.split('\n')
+  const parts: string[] = []
+  let current: string[] = []
+  const flush = () => { if (current.length) { parts.push(current.join('\n')); current = [] } }
+  const headingRe = /^(#+\s+|\d+\.|[A-Z][A-Za-z\s\-]{0,40}:?|^\s*\[[A-Za-z ]+\]\s*$)/
+  for (const ln of lines) {
+    if (headingRe.test(ln)) flush()
+    current.push(ln)
+  }
+  flush()
+  return parts.filter(p => p.trim().length > 0)
+}
+
+function createOverlappingChunks(blocks: string[]): string[] {
+  const out: string[] = []
+  let words: string[] = []
+  for (const b of blocks) {
+    words.push(...b.split(/\s+/))
+    while (words.length > MAX_WORDS) {
+      const chunk = words.slice(0, MAX_WORDS).join(' ')
+      out.push(chunk)
+      words = words.slice(MAX_WORDS - OVERLAP_WORDS)
+    }
+  }
+  if (words.length) out.push(words.join(' '))
+  return out
+}
+
+export async function buildCardsFromPages(pages: Page[], filename: string, user_id: string, project_id: string) {
+  let full = ''
+  for (const p of pages) full += `\n\n[[Page ${p.page_num}]]\n${(p.text || '').trim()}\n`
+  const coarse = byHeadings(full)
+  const chunks = createOverlappingChunks(coarse)
+
+  const out: any[] = []
+  for (let i = 0; i < chunks.length; i++) {
+    const cleaned = await cleanChunkText(chunks[i])
+    const topic = (await cheapSummarize(cleaned, 1)) || (cleaned.slice(0, 80) + '...')
+    const summary = await cheapSummarize(cleaned, 3)
+    const firstPage = pages[0]?.page_num ?? 1
+    const lastPage = pages[pages.length - 1]?.page_num ?? 1
+    out.push({
+      user_id,
+      project_id,
+      filename,
+      topic_name: topic.slice(0, 120),
+      summary,
+      content: cleaned,
+      page_span: [firstPage, lastPage],
+      card_id: `${slugify(filename)}-c${String(i + 1).padStart(4, '0')}`
+    })
+  }
+  return out
+}
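The chunker first cuts on heading-like lines, then re-windows the words: each chunk carries up to MAX_WORDS = 220 words and the cursor advances by 220 - 40 = 180, so adjacent chunks share a 40-word overlap. For a hypothetical 500-word block that means windows over words [0, 220), [180, 400), [360, 500), with the tail chunk shorter. Note that page_span is the same [first, last] pair for every card of a file, since chunking runs over the concatenated page text rather than per page. A usage sketch with made-up pages:

const pages = [{ page_num: 1, text: 'Overview: ...' }, { page_num: 2, text: '...' }]  // hypothetical Page[]
const cards = await buildCardsFromPages(pages, 'report.pdf', 'u1', 'p1')
// each card: { card_id, topic_name, summary, content, page_span: [1, 2], user_id, project_id, filename }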
ingestion_js/lib/jobs.ts
ADDED
@@ -0,0 +1,26 @@
+import { getMongo } from './mongo'
+
+export type JobDoc = {
+  _id: string
+  created_at: number
+  total: number
+  completed: number
+  status: 'processing' | 'completed' | 'failed'
+  last_error: string | null
+}
+
+export async function createJob(job_id: string, total: number) {
+  const { db } = await getMongo()
+  const doc: JobDoc = { _id: job_id, created_at: Date.now() / 1000, total, completed: 0, status: 'processing', last_error: null }
+  await db.collection('jobs').insertOne(doc)
+}
+
+export async function updateJob(job_id: string, fields: Partial<JobDoc>) {
+  const { db } = await getMongo()
+  await db.collection('jobs').updateOne({ _id: job_id }, { $set: fields })
+}
+
+export async function getJob(job_id: string) {
+  const { db } = await getMongo()
+  return db.collection('jobs').findOne({ _id: job_id })
+}
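Job documents use the caller-supplied UUID as Mongo's _id, so every poll is a primary-key read. The expected lifecycle, with a placeholder id standing in for randomUUID():

const id = 'job-123'                                        // placeholder; the upload route mints a UUID
await createJob(id, 2)                                      // { status: 'processing', completed: 0, total: 2 }
await updateJob(id, { completed: 1 })                       // per-file progress
await updateJob(id, { completed: 2, status: 'completed' })
const job = await getJob(id)                                // what GET /api/upload?job_id=... reads back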
ingestion_js/lib/summarizer.ts
ADDED
@@ -0,0 +1,19 @@
+export async function cheapSummarize(text: string, maxSentences = 3): Promise<string> {
+  if (!text || text.trim().length < 50) return text.trim()
+  try {
+    const sentences = text.split(/(?<=[.!?])\s+/).filter(Boolean)
+    if (sentences.length <= maxSentences) return text.trim()
+    let out = sentences.slice(0, maxSentences).join(' ')
+    if (!/[.!?]$/.test(out)) out += '.'
+    return out
+  } catch {
+    return text.length > 200 ? text.slice(0, 200) + '...' : text
+  }
+}
+
+export async function cleanChunkText(text: string): Promise<string> {
+  let t = text
+  t = t.replace(/\n\s*Page \d+\s*\n/gi, '\n')
+  t = t.replace(/\s{3,}/g, ' ')
+  return t.trim()
+}
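cheapSummarize is deliberately extractive rather than an LLM call: it keeps the first maxSentences sentences, splitting on terminal punctuation, and falls back to a 200-character prefix if splitting throws. Expected behavior on made-up input:

await cheapSummarize('Alpha covers the first topic here. Beta adds supporting detail. Gamma closes the section.', 2)
// -> 'Alpha covers the first topic here. Beta adds supporting detail.'
await cleanChunkText('intro   text\n Page 3 \nmore')
// -> 'intro text\nmore'  (page markers stripped, runs of 3+ spaces collapsed)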
ingestion_js/tsconfig.json
CHANGED
@@ -14,7 +14,11 @@
     "isolatedModules": true,
     "jsx": "preserve",
     "incremental": true,
-    "types": ["node"]
+    "types": ["node"],
+    "baseUrl": ".",
+    "paths": {
+      "@/*": ["./*"]
+    }
   },
   "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
   "exclude": ["node_modules"]
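The baseUrl/paths addition is what backs the @/ imports used throughout the new routes: @/* resolves against the ingestion_js root, so for example:

import { getFileChunks } from '@/lib/mongo'  // -> ./lib/mongo, instead of a ../../.. relative chain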
ingestion_python/README.md
CHANGED
@@ -13,7 +13,7 @@ short_description: 'backend for data ingestion'
 
 A dedicated service for processing file uploads and storing them in MongoDB Atlas. This service mirrors the main system's file processing functionality while running as a separate service to share the processing load.
 
-[API docs](
+[API docs](CURL.md)
 
 ## 🏗️ Architecture
 