|
from starlette.applications import Starlette |
|
from starlette.responses import JSONResponse, PlainTextResponse |
|
from starlette.routing import Route |
|
from starlette.requests import Request |
|
from starlette.middleware import Middleware |
|
from starlette.middleware.cors import CORSMiddleware |
|
import tempfile |
|
import shutil |
|
import os |
|
|
|
import pymupdf4llm |
|
from unstructured.partition.auto import partition |
|
from unstructured.cleaners.core import clean |
|
from chonkie import RecursiveChunker, RecursiveRules |
|
|
|
# Chunking recipe for Korean plain-text documents, laid out as a complete
# recipe document (name / schema / metadata / recipe).
#
# Only the "recipe" -> "recursive_rules" section is handed to RecursiveRules
# below; the remaining keys are descriptive and are not consumed in this file.
#
# The CJK sentence terminators are written as \u escapes so they survive an
# encoding round-trip of this file and cannot be confused with their ASCII
# look-alikes:
#   \u3002 = ideographic full stop
#   \uff01 = fullwidth exclamation mark
#   \uff1f = fullwidth question mark
_RECIPE_SPEC = {
    "name": "default",
    "schema": "v1",
    "description": "Default recipe for plaintext documents in Korean",
    "language": "kr",
    "metadata": {
        "version": "0.1.0",
        "author": "Chonkie Team",
    },
    "recipe": {
        "delimiters": [".", "\u3002", "!", "\uff01", "?", "\uff1f", "\n"],
        "include_delim": "prev",
        "recursive_rules": {
            # Levels are listed from the coarsest split to the finest one.
            "levels": [
                # 1) Paragraph breaks.
                # NOTE(review): "\n\r" may have been meant as "\r\n" - confirm.
                {
                    "delimiters": ["\n\n", "\n\r"],
                    "whitespace": False,
                    "include_delim": "next",
                },
                # 2) Single line breaks.
                {
                    "delimiters": ["\n", "\r"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # 3) Sentence terminators (ASCII and CJK/fullwidth forms).
                {
                    "delimiters": [".", "\u3002", "!", "\uff01", "?", "\uff1f"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # 4) Whitespace-separated words.
                {
                    "delimiters": None,
                    "whitespace": True,
                    "include_delim": "prev",
                },
                # 5) No delimiters and no whitespace splitting: last resort.
                {
                    "delimiters": None,
                    "whitespace": False,
                    "include_delim": "prev",
                },
            ],
        },
    },
}

# RecursiveRules.from_dict is expected to read a top-level "levels" key (the
# shape of the "recursive_rules" section), so the whole recipe document must
# not be passed to it. A shallow copy is handed over so that _RECIPE_SPEC stays
# intact even if from_dict consumes keys from its argument.
recipe = RecursiveRules.from_dict(dict(_RECIPE_SPEC["recipe"]["recursive_rules"]))

# Shared chunker instance used by the request handler(s) in this module.
chunker = RecursiveChunker(rules=recipe)
|
|
|
|
|
def _extract_text(path: str, ext: str) -> str:
    """Return the text of the document at *path*, picking the parser by *ext*.

    PDFs are converted to Markdown with pymupdf4llm. Every other extension is
    partitioned with unstructured, the elements are joined with newlines and
    the result is passed through ``clean`` with ``dashes`` and
    ``trailing_punctuation`` enabled.
    """
    if ext == ".pdf":
        return pymupdf4llm.to_markdown(path)

    elements = partition(path)
    return clean(
        "\n".join(str(el) for el in elements),
        dashes=True,
        trailing_punctuation=True,
    )


async def handle_file_upload(request: Request):
    """Chunk an uploaded document and return the chunk texts as JSON.

    The file is read from the multipart form field ``file``, copied to a
    temporary file that keeps the original extension, converted to text and
    split with the module-level ``chunker``.

    Args:
        request: Incoming request carrying the form data.

    Returns:
        A JSONResponse holding
        * a list of chunk strings on success,
        * ``{"error": ...}`` with status 400 when no file was supplied,
        * ``{"error": ...}`` with status 500 when storing or processing the
          document fails.
    """
    form = await request.form()
    upload = form.get("file")

    # A plain (non-file) form field named "file" arrives as a str, which has
    # no ``filename`` attribute; treat it like a missing upload instead of
    # failing with AttributeError.
    filename = getattr(upload, "filename", None)
    if not filename:
        # Message: "Please upload a file."
        return JSONResponse({"error": "파일을 업로드해주세요."}, status_code=400)

    # Lower-cased so that ".PDF" and ".pdf" select the same parser.
    ext = os.path.splitext(filename)[1].lower()

    tmp_path = None
    try:
        # delete=False: the parsers open the file by path after this handle is
        # closed, so removal is done explicitly in ``finally``. Creating the
        # file inside the ``try`` guarantees it is removed even when the copy
        # itself fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(upload.file, tmp)

        chunks = chunker(_extract_text(tmp_path, ext))
        return JSONResponse([chunk.text for chunk in chunks])
    except Exception as e:
        # Request boundary: report the failure to the client as JSON.
        # Message prefix: "Document processing failed".
        return JSONResponse({"error": f"문서 처리 실패: {e}"}, status_code=500)
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup; a failed delete must not replace the
                # response that has already been prepared.
                pass
|
|
|
|
|
# URL table: a single POST endpoint that accepts document uploads.
routes = [Route("/upload", handle_file_upload, methods=["POST"])]

# Middleware stack. CORS is configured with wildcards for origins, methods and
# headers.
# NOTE(review): confirm that a fully open CORS policy is intended.
middleware = [
    Middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    ),
]

# ASGI application object built from the route table and middleware above.
# NOTE(review): debug mode is switched on; confirm this is intended outside
# of development.
app = Starlette(
    routes=routes,
    middleware=middleware,
    debug=True,
)