"""Starlette service that accepts a document upload and returns its text chunks."""

import os
import shutil
import tempfile

import pymupdf4llm
from chonkie import RecursiveChunker, RecursiveRules
from starlette.applications import Starlette
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
from starlette.requests import Request
from starlette.responses import JSONResponse, PlainTextResponse
from starlette.routing import Route
from unstructured.cleaners.core import clean
from unstructured.partition.auto import partition

# Recursive chunking recipe: split on blank lines first, then single line
# breaks, then sentence-ending punctuation, then whitespace, and finally
# fall back to the last level, which has no delimiters and no whitespace split.
# NOTE(review): "!" and "?" each appear twice in the delimiter lists below;
# the duplicates may originally have been full-width variants -- confirm.
recipe = RecursiveRules.from_dict({
    "name": "default",
    "schema": "v1",
    "description": "Default recipe for plaintext documents in Korean",
    "language": "kr",
    "metadata": {
        "version": "0.1.0",
        "author": "Chonkie Team",
    },
    "recipe": {
        "delimiters": [".", "。", "!", "!", "?", "?", "\n"],
        "include_delim": "prev",
        "recursive_rules": {
            "levels": [
                {
                    "delimiters": ["\n\n", "\n\r"],
                    "whitespace": False,
                    "include_delim": "next",
                },
                {
                    "delimiters": ["\n", "\r"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                {
                    "delimiters": [".", "。", "!", "!", "?", "?"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                {
                    "delimiters": None,
                    "whitespace": True,
                    "include_delim": "prev",
                },
                {
                    "delimiters": None,
                    "whitespace": False,
                    "include_delim": "prev",
                },
            ],
        },
    },
})

# Single module-level chunker shared by every request.
chunker = RecursiveChunker(rules=recipe)


def _extract_text(path: str, ext: str) -> str:
    """Return the text of the document at *path*, chosen by its extension *ext*.

    PDFs are converted to Markdown with pymupdf4llm; every other extension is
    partitioned with unstructured, joined one element per line, and cleaned.
    """
    if ext == ".pdf":
        return pymupdf4llm.to_markdown(path)

    elements = partition(path)
    return clean(
        "\n".join(str(el) for el in elements),
        dashes=True,
        trailing_punctuation=True,
    )


# Document processing endpoint
async def handle_file_upload(request: Request) -> JSONResponse:
    """Chunk the document uploaded in the ``file`` form field.

    Args:
        request: Incoming request whose form data is expected to carry an
            uploaded file under the key ``"file"``.

    Returns:
        A JSON array of chunk texts on success; a JSON ``{"error": ...}``
        object with status 400 when no named file was uploaded, or with
        status 500 when saving or processing the document fails.
    """
    form = await request.form()
    upload = form.get("file")

    # A missing field yields None and a plain-text field yields a str; neither
    # has a ``filename`` attribute, so both are rejected here instead of
    # raising AttributeError.
    filename = getattr(upload, "filename", None)
    if not filename:
        return JSONResponse({"error": "파일을 업로드해주세요."}, status_code=400)

    ext = os.path.splitext(filename)[1].lower()

    tmp_path = None
    try:
        # Persist the upload to a temporary file because both parsers below
        # are given a filesystem path. The path is recorded before copying so
        # the ``finally`` clause removes the file even if the copy fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(upload.file, tmp)

        # NOTE(review): extraction and chunking are synchronous calls made
        # directly inside this coroutine -- confirm this is acceptable for the
        # expected request load.
        text = _extract_text(tmp_path, ext)
        return JSONResponse([chunk.text for chunk in chunker(text)])
    except Exception as e:
        return JSONResponse({"error": f"문서 처리 실패: {e}"}, status_code=500)
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)


# Routing configuration
routes = [
    Route("/upload", handle_file_upload, methods=["POST"]),
]

# Allow CORS (optional)
middleware = [
    Middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    ),
]

# NOTE(review): debug=True is presumably meant for development only --
# confirm before deploying.
app = Starlette(debug=True, routes=routes, middleware=middleware)