# package/main.py
# Uploaded by Doramong with huggingface_hub (commit 6a80b48, verified).
from starlette.applications import Starlette
from starlette.responses import JSONResponse, PlainTextResponse
from starlette.routing import Route
from starlette.requests import Request
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
import tempfile
import shutil
import os
import pymupdf4llm
from unstructured.partition.auto import partition
from unstructured.cleaners.core import clean
from chonkie import RecursiveChunker, RecursiveRules
# Hub-style recipe document describing how Korean plaintext is split.
# Only the "recipe" -> "recursive_rules" section is consumed below; the
# remaining keys are descriptive metadata kept for reference.
_KOREAN_RECIPE = {
    "name": "default",
    "schema": "v1",
    "description": "Default recipe for plaintext documents in Korean",
    "language": "kr",
    "metadata": {
        "version": "0.1.0",
        "author": "Chonkie Team",
    },
    "recipe": {
        # Sentence terminators: ASCII plus the full-width CJK forms.
        # (The full-width characters were previously stored as mis-decoded
        # bytes and therefore could never match real text.)
        "delimiters": [".", "。", "!", "!", "?", "?", "\n"],
        "include_delim": "prev",
        "recursive_rules": {
            "levels": [
                # Level 1: paragraph breaks.
                {
                    "delimiters": ["\n\n", "\n\r"],
                    "whitespace": False,
                    "include_delim": "next",
                },
                # Level 2: single line breaks.
                {
                    "delimiters": ["\n", "\r"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Level 3: sentence terminators (ASCII and full-width).
                {
                    "delimiters": [".", "。", "!", "!", "?", "?"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Level 4: split on whitespace.
                {
                    "delimiters": None,
                    "whitespace": True,
                    "include_delim": "prev",
                },
                # Level 5: no delimiters and no whitespace splitting; the
                # chunker's last-resort level.
                {
                    "delimiters": None,
                    "whitespace": False,
                    "include_delim": "prev",
                },
            ],
        },
    },
}

# RecursiveRules.from_dict reads the "levels" key of the mapping it is given,
# so it must receive the nested "recursive_rules" section rather than the whole
# recipe document. A shallow copy is passed so the constant above is not
# modified if from_dict consumes keys from its argument.
recipe = RecursiveRules.from_dict(dict(_KOREAN_RECIPE["recipe"]["recursive_rules"]))

# Shared chunker instance reused by every request.
chunker = RecursiveChunker(rules=recipe)
# Document-processing endpoint.
def _extract_text(path: str, ext: str) -> str:
    """Return the text content of the document stored at *path*.

    PDFs (``ext == ".pdf"``) are converted to Markdown with pymupdf4llm; any
    other file is partitioned with ``unstructured`` and the joined element
    text is passed through ``clean``.
    """
    if ext == ".pdf":
        return pymupdf4llm.to_markdown(path)

    elements = partition(path)
    return clean(
        "\n".join(str(el) for el in elements),
        dashes=True,
        trailing_punctuation=True,
    )


async def handle_file_upload(request: Request):
    """Chunk an uploaded document and return the chunk texts as JSON.

    Expects a multipart form with a file field named ``file``.

    Returns:
        * 200 with a JSON list of chunk strings on success.
        * 400 with ``{"error": ...}`` when no file was uploaded (missing field,
          a non-file field, or an empty filename).
        * 500 with ``{"error": ...}`` when saving or processing the file fails.
    """
    form = await request.form()
    try:
        upload = form.get("file")
        # A plain text field named "file" arrives as a str and has no
        # ``filename`` attribute; treat it the same as a missing upload
        # instead of crashing with AttributeError.
        filename = getattr(upload, "filename", None)
        if not filename:
            return JSONResponse({"error": "파일을 업로드해주세요."}, status_code=400)

        ext = os.path.splitext(filename)[1].lower()

        tmp_path = None
        try:
            # Persist the upload to disk: both parsers are given a filesystem
            # path. The suffix is preserved for the non-PDF parser.
            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
                # Record the path before copying so a failed copy is still
                # cleaned up by the ``finally`` below.
                tmp_path = tmp.name
                shutil.copyfileobj(upload.file, tmp)

            chunks = chunker(_extract_text(tmp_path, ext))
            return JSONResponse([chunk.text for chunk in chunks])
        except Exception as e:
            # Top-level boundary: report the failure to the client as JSON.
            return JSONResponse({"error": f"문서 처리 실패: {e}"}, status_code=500)
        finally:
            if tmp_path is not None:
                os.unlink(tmp_path)
    finally:
        # Release the spooled temporary files backing the parsed form.
        await form.close()
# Routing: a single POST endpoint that accepts document uploads.
routes = [
    Route(
        "/upload",
        handle_file_upload,
        methods=["POST"],
    ),
]

# CORS (optional): every origin, method and header is allowed.
# NOTE(review): wildcard CORS is very permissive — confirm this is intended
# outside of local development.
middleware = [
    Middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    ),
]

# NOTE(review): debug=True presumably exposes tracebacks in error responses —
# confirm it is disabled for production deployments.
app = Starlette(
    debug=True,
    routes=routes,
    middleware=middleware,
)