Auto commit at 2025-08-23 3:51:46

Files changed:
- lily_llm_api/app_v2.py +179 -103
- lily_llm_core/document_processor.py +692 -469
- lily_llm_core/integrated_memory_manager.py +404 -0
- lily_llm_core/room_context_manager.py +374 -0
- lily_llm_core/user_memory_manager.py +302 -0
- room_contexts/default.json +60 -0
- user_memories/anonymous.json +18 -0
- user_memories/kdy.json +22 -0

lily_llm_api/app_v2.py
CHANGED
@@ -60,6 +60,9 @@ from lily_llm_core.hybrid_rag_processor import hybrid_rag_processor
 60       # Context manager and LoRA manager
 61       from lily_llm_core.context_manager import get_context_manager, context_manager
 62
 63       # Global variables
 64       current_model = None      # currently loaded model instance
 65       current_profile = None    # currently selected model profile
@@ -491,7 +494,8 @@ def load_model_sync(model_id: str):
 491
 492   def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None,
 493                     temperature: Optional[float] = None, top_p: Optional[float] = None,
 494 -                   do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None
 495       """[Optimized] Unified synchronous function that handles model generation"""
 496       try:
 497           print(f"[DEBUG] generate_sync started - prompt length: {len(prompt)}")
@@ -515,16 +519,24 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
 515       combined_image_metas = None
 516
 517       # --- 1. Image processing (official approach) ---
 518 -
 519 -
 520
 521       # Official approach: simple image processing
 522 -         max_images = min(len(
 523           logger.info(f"Multimodal processing started... ({max_images} images)")
 524
 525           try:
 526               metas_list = []
 527 -             for idx, image_bytes in enumerate(
 528                   if image_bytes:
 529                       try:
 530                           pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
@@ -568,31 +580,77 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
 568
 569       # 2. RAG retrieval context (includes PDF content)
 570       try:
 571 -         #
 572           rag_context = ""
 573
 574 -         #
 575           try:
 576 -             #
 577 -
 578 -
 579 -
 580 -
 581 -
 582 -
 583 -
 584 -
 585 -
 586
 587               if os.path.exists(user_path):
 588 -
 589 -                 if
 590 -                     # Use the most recent document ID
 591 -                     recent_doc_id = doc_dirs[-1]
 592 -                     print(f"[DEBUG] RAG context lookup: user={user_dir}, document={recent_doc_id}")
 593
 594                   # Read the document content file directly (read the pickle file)
 595 -                 doc_path = os.path.join(user_path, recent_doc_id)
 596                   if os.path.exists(doc_path):
 597                       # Read content from the pickle file first
 598                       pickle_file = os.path.join(doc_path, "simple_vector_store.pkl")
@@ -607,18 +665,64 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
 607                           documents_data = vector_store_data['documents']
 608                           if documents_data and len(documents_data) > 0:
 609                               rag_context = "\n\nUploaded document content:\n"
 610                               for i, doc in enumerate(documents_data[:2]):  # at most 2 chunks
 611                                   if hasattr(doc, 'page_content'):
 612                                       content = doc.page_content.strip()
 613                                       if content and len(content) > 30:
 614 -                                         #
 615 -
 616 -
 617
 618                       if len(rag_context) > 30:
 619                           context_prompt += rag_context
 620                           print(f"[DEBUG] RAG context included - length: {len(rag_context)}")
 621                           print(f"[DEBUG] RAG context preview: {rag_context[:100]}...")
 622                       else:
 623                           print(f"[DEBUG] RAG context too short: {len(rag_context)}")
 624                   else:
@@ -731,7 +835,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
 731       # --- 3. Tokenization ---
 732       print(f"[DEBUG] tokenization started")
 733       t_tok_start = time.time()
 734 -     if not
 735           # Text-only fast path (faster)
 736           print(f"[DEBUG] text-only tokenization path")
 737           print(f"[DEBUG] prompt to use: {formatted_prompt}")

@@ -754,6 +858,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
 754           # Multimodal (Lite): Kanana-specific encode_prompt builds the -1 token placeholders (required)
 755           print(f"[DEBUG] multimodal tokenization path")
 756           print(f"[DEBUG] combined_image_metas: {combined_image_metas}")
 757
 758           if hasattr(tokenizer, 'encode_prompt'):
 759               print(f"[DEBUG] using encode_prompt method")

@@ -1352,6 +1457,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
 1352      total_time = time.time() - t_tok_start
 1353      print(f"[DEBUG] processing complete - total time: {total_time:.3f}s")
 1354
 1355      return {
 1356          "generated_text": response,
 1357          "processing_time": total_time,
@@ -1527,6 +1634,8 @@ async def generate(request: Request,
 1527      image2: UploadFile = File(None),
 1528      image3: UploadFile = File(None),
 1529      image4: UploadFile = File(None),
 1530      use_context: bool = Form(True),
 1531      session_id: str = Form(None)):
 1532
@@ -1535,20 +1644,12 @@ async def generate(request: Request,
 1535
 1536      start_time = time.time()
 1537
 1538 -    # Auto-generate a session ID if none was provided (
 1539      if not session_id:
 1540 -        #
 1541 -
 1542 -
 1543 -
 1544 -            client_ip = request.client.host if request.client else "unknown"
 1545 -        except:
 1546 -            pass
 1547 -
 1548 -        # Build the session from client IP + time (kept for one day)
 1549 -        day_timestamp = int(time.time() // 86400) * 86400  # round down to whole days
 1550 -        session_id = f"client_{client_ip}_{day_timestamp}"
 1551 -        print(f"[DEBUG] auto-generated session ID: {session_id} (client: {client_ip})")
 1552
 1553      if use_context:
 1554          context_manager.add_user_message(prompt, metadata={"session_id": session_id})
@@ -1566,7 +1667,7 @@ async def generate(request: Request,
 1566
 1567      try:
 1568          # Call generate_sync (with context)
 1569 -        result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id)
 1570
 1571          if "error" in result:
 1572              raise HTTPException(status_code=500, detail=result["error"])

@@ -1716,6 +1817,7 @@ async def health_check():
 1716  async def upload_document(
 1717      file: UploadFile = File(...),
 1718      user_id: str = Form("default_user"),  # default user ID
 1719      document_id: Optional[str] = Form(None)  # document ID (auto-generated if omitted)
 1720  ):
 1721      """Upload a document and run RAG processing"""
@@ -1746,73 +1848,47 @@ async def upload_document(
 1746      processing_time = time.time() - start_time
 1747      logger.info(f"Document upload complete ({processing_time:.2f}s): {file.filename}")
 1748
 1749 -    #
 1750      if result["success"]:
 1751          try:
 1752 -            #
 1753 -
 1754
 1755 -
 1756
 1757 -            # Generate a response based on the document content using the RAG system
 1758 -            try:
 1759 -                # Build a prompt that includes the document content via RAG retrieval
 1760 -                summary_query = f"Please summarize the main content of the uploaded document '{file.filename}'."
 1761 -
 1762 -                # Generate the RAG response (includes document content)
 1763 -                rag_result = rag_processor.generate_rag_response(
 1764 -                    user_id, document_id, summary_query, llm_model=model
 1765 -                )
 1766 -
 1767 -                if rag_result["success"]:
 1768 -                    logger.info(f"Automatic AI response generated: {len(rag_result['response'])} characters")
 1769 -                    result["auto_response"] = rag_result["response"]
 1770 -                else:
 1771 -                    logger.warning(f"Automatic AI response generation failed: {rag_result.get('error', 'Unknown error')}")
 1772 -                    # Fall back to generate_sync when RAG fails
 1773 -                    if model and hasattr(model, 'generate'):
 1774 -                        try:
 1775 -                            from .app_v2 import generate_sync
 1776 -
 1777 -                            # Build a prompt that includes the document content
 1778 -                            context_prompt = f"""
 1779 -                            Please summarize based on the following document:
 1780 -
 1781 -                            Document name: {file.filename}
 1782 -                            Document ID: {document_id}
 1783 -
 1784 -                            Document content:
 1785 -                            {result.get('chunks', [])}
 1786 -
 1787 -                            Please summarize the main content of the document above.
 1788 -                            """
 1789 -
 1790 -                            response = generate_sync(
 1791 -                                prompt=context_prompt,
 1792 -                                image_data_list=None,
 1793 -                                session_id=None
 1794 -                            )
 1795 -
 1796 -                            if response and "response" in response:
 1797 -                                result["auto_response"] = response["response"]
 1798 -                                logger.info(f"Fallback AI response generated: {len(response['response'])} characters")
 1799 -                            else:
 1800 -                                result["auto_response"] = "Could not generate a document summary."
 1801 -                                logger.warning(f"generate_sync response format error")
 1802 -
 1803 -                        except Exception as e:
 1804 -                            logger.error(f"Fallback generate_sync call failed: {e}")
 1805 -                            result["auto_response"] = "An error occurred while generating the document summary."
 1806 -                    else:
 1807 -                        result["auto_response"] = "Could not generate a document summary."
 1808 -
 1809 -            except Exception as e:
 1810 -                logger.error(f"Error while generating the RAG response: {e}")
 1811 -                result["auto_response"] = "An error occurred while generating the document summary."
 1812 -
 1813      except Exception as e:
 1814 -        logger.
 1815 -
 1816
 1817      return DocumentUploadResponse(
 1818          success=result["success"],
60 |
# ์ปจํ
์คํธ ๊ด๋ฆฌ์ ๋ฐ LoRA ๊ด๋ฆฌ์ ์ถ๊ฐ
|
61 |
from lily_llm_core.context_manager import get_context_manager, context_manager
|
62 |
|
63 |
+
# ๊ณ์ธต์ ๋ฉ๋ชจ๋ฆฌ ์์คํ
์ถ๊ฐ
|
64 |
+
from lily_llm_core.integrated_memory_manager import integrated_memory_manager
|
65 |
+
|
66 |
# ์ ์ญ ๋ณ์๋ค
|
67 |
current_model = None # ๐ ํ์ฌ ๋ก๋๋ ๋ชจ๋ธ ์ธ์คํด์ค
|
68 |
current_profile = None # ๐ ํ์ฌ ์ ํ๋ ๋ชจ๋ธ ํ๋กํ
|
|
|
494 |
|
495 |
def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None,
|
496 |
temperature: Optional[float] = None, top_p: Optional[float] = None,
|
497 |
+
do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
|
498 |
+
user_id: str = "anonymous", room_id: str = "default") -> dict:
|
499 |
"""[์ต์ ํ] ๋ชจ๋ธ ์์ฑ์ ์ฒ๋ฆฌํ๋ ํตํฉ ๋๊ธฐ ํจ์"""
|
500 |
try:
|
501 |
print(f"๐ [DEBUG] generate_sync ์์ - prompt ๊ธธ์ด: {len(prompt)}")
|
|
|
519 |
combined_image_metas = None
|
520 |
|
521 |
# --- 1. ์ด๋ฏธ์ง ์ฒ๋ฆฌ (๊ณต์ ๋ฐฉ์) ---
|
522 |
+
# ๐ RAG์์ ์ถ์ถ๋ ์ด๋ฏธ์ง ๋ฐ์ดํฐ๋ ํฌํจ
|
523 |
+
all_image_data = []
|
524 |
+
if image_data_list and len([img for img in image_data_list if img]) > 0:
|
525 |
+
all_image_data.extend(image_data_list)
|
526 |
+
print(f"๐ [DEBUG] ์ง์ ์ ๋ฌ๋ ์ด๋ฏธ์ง {len(image_data_list)}๊ฐ ์ถ๊ฐ")
|
527 |
+
|
528 |
+
# ๐ RAG์์ ์ถ์ถ๋ ์ด๋ฏธ์ง ๋ฐ์ดํฐ๋ ํ์ฌ ๊ตฌํ์์ ์ ๊ฑฐ๋จ (์ ์ญ ๋ณ์ ๋ฌธ์ ํด๊ฒฐ)
|
529 |
+
|
530 |
+
if all_image_data and len([img for img in all_image_data if img]) > 0 and getattr(current_profile, 'multimodal', False):
|
531 |
+
print(f"๐ [DEBUG] ์ด๋ฏธ์ง ์ฒ๋ฆฌ ์์ - ์ด ์ด๋ฏธ์ง ๊ฐ์: {len([img for img in all_image_data if img])}")
|
532 |
|
533 |
# ๐ ๊ณต์ ๋ฐฉ์: ๊ฐ๋จํ ์ด๋ฏธ์ง ์ฒ๋ฆฌ
|
534 |
+
max_images = min(len(all_image_data), 4)
|
535 |
logger.info(f"๐ผ๏ธ ๋ฉํฐ๋ชจ๋ฌ ์ฒ๋ฆฌ ์์... (์ด๋ฏธ์ง {max_images}๊ฐ)")
|
536 |
|
537 |
try:
|
538 |
metas_list = []
|
539 |
+
for idx, image_bytes in enumerate(all_image_data[:max_images]):
|
540 |
if image_bytes:
|
541 |
try:
|
542 |
pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
|
|
580 |
|
581 |
# 2. RAG ๊ฒ์ ๊ฒฐ๊ณผ ์ปจํ
์คํธ (PDF ๋ด์ฉ ํฌํจ)
|
582 |
try:
|
583 |
+
# ๐ ์๋ก์ด ๋ฉ๋ชจ๋ฆฌ ์์คํ
์ ์ฌ์ฉํ RAG ์ปจํ
์คํธ ๋ก๋
|
584 |
rag_context = ""
|
585 |
|
586 |
+
# ํตํฉ ๋ฉ๋ชจ๋ฆฌ ๊ด๋ฆฌ์์์ AI์ฉ ์ปจํ
์คํธ ์์ฑ
|
587 |
+
ai_context = integrated_memory_manager.get_context_for_ai(
|
588 |
+
user_id=user_id,
|
589 |
+
room_id=room_id,
|
590 |
+
session_id=session_id,
|
591 |
+
include_user_memory=True,
|
592 |
+
include_room_context=True,
|
593 |
+
include_session_history=False # ํ์ฌ ๋ํ๋ ๋ณ๋๋ก ์ฒ๋ฆฌ
|
594 |
+
)
|
595 |
+
|
596 |
+
if ai_context:
|
597 |
+
rag_context += f"\n\n๐ ๋ฉ๋ชจ๋ฆฌ ์ปจํ
์คํธ:\n{ai_context}\n"
|
598 |
+
print(f"๐ [DEBUG] ๋ฉ๋ชจ๋ฆฌ ์ปจํ
์คํธ ํฌํจ๋จ - ๊ธธ์ด: {len(ai_context)}")
|
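The integrated memory manager called above (get_context_for_ai with user_id, room_id, session_id and include_* flags) lives in lily_llm_core/integrated_memory_manager.py, which this commit adds but whose body is not part of this diff. A minimal sketch of how that method could compose a prompt context from user memory and room context, assuming hypothetical get_user_memory/get_room_context accessors on the two sub-managers:

# Hypothetical sketch -- names of the sub-manager accessors are assumptions,
# only the call signature matches what app_v2.py uses above.
class IntegratedMemoryManager:
    def __init__(self, user_memory_manager, room_context_manager):
        self.user_memory_manager = user_memory_manager
        self.room_context_manager = room_context_manager

    def get_context_for_ai(self, user_id: str, room_id: str, session_id: str = None,
                           include_user_memory: bool = True,
                           include_room_context: bool = True,
                           include_session_history: bool = False) -> str:
        parts = []
        if include_user_memory:
            memory = self.user_memory_manager.get_user_memory(user_id)  # assumed accessor
            if memory:
                parts.append(f"User profile: {memory}")
        if include_room_context:
            room = self.room_context_manager.get_room_context(room_id)
            if room and room.documents:
                names = ", ".join(
                    d.get("filename", "?") if isinstance(d, dict) else getattr(d, "filename", "?")
                    for d in room.documents[-3:]
                )
                parts.append(f"Documents in this room: {names}")
        # Session history is handled separately by the caller, as noted above.
        return "\n".join(parts)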
599 |
+
|
600 |
+
# ๊ธฐ์กด RAG ์์คํ
์์ ๋ฌธ์ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ (room_id ๊ธฐ๋ฐ)
|
601 |
try:
|
602 |
+
# ์ฑํ
๋ฐฉ๋ณ ๋ฌธ์ ์ปจํ
์คํธ ์กฐํ
|
603 |
+
room_context = integrated_memory_manager.room_context_manager.get_room_context(room_id)
|
604 |
+
if room_context and room_context.documents:
|
605 |
+
rag_context += "\n\n๐ ์
๋ก๋๋ ๋ฌธ์ ๋ชฉ๋ก:\n"
|
606 |
+
for doc in room_context.documents[-3:]: # ์ต๊ทผ 3๊ฐ๋ง
|
607 |
+
# ๋์
๋๋ฆฌ์ ๊ฐ์ฒด ๋ชจ๋ ์ฒ๋ฆฌ
|
608 |
+
if isinstance(doc, dict):
|
609 |
+
filename = doc.get('filename', 'unknown')
|
610 |
+
doc_type = doc.get('document_type', 'unknown')
|
611 |
+
page_count = doc.get('page_count', 0)
|
612 |
+
else:
|
613 |
+
filename = getattr(doc, 'filename', 'unknown')
|
614 |
+
doc_type = getattr(doc, 'document_type', 'unknown')
|
615 |
+
page_count = getattr(doc, 'page_count', 0)
|
616 |
|
617 |
+
rag_context += f" - {filename} ({doc_type}, {page_count}ํ์ด์ง)\n"
|
618 |
+
|
619 |
+
print(f"๐ [DEBUG] ์ฑํ
๋ฐฉ {room_id}์ ๋ฌธ์ {len(room_context.documents)}๊ฐ ๋ฐ๊ฒฌ")
|
620 |
+
|
621 |
+
except Exception as e:
|
622 |
+
print(f"โ ๏ธ ์ฑํ
๋ฐฉ ๋ฌธ์ ์ปจํ
์คํธ ๋ก๋ ์คํจ: {e}")
|
623 |
+
|
624 |
+
# ์๋ก์ด ๋ฉ๋ชจ๋ฆฌ ์์คํ
๊ธฐ๋ฐ RAG ์ปจํ
์คํธ ๋ก๋
|
625 |
+
try:
|
626 |
+
# ํ์ฌ ์ฑํ
๋ฐฉ์ ์ต์ ๋ฌธ์ ID ์ฌ์ฉ
|
627 |
+
room_context = integrated_memory_manager.room_context_manager.get_room_context(room_id)
|
628 |
+
if room_context and room_context.documents:
|
629 |
+
# ๊ฐ์ฅ ์ต๊ทผ์ ์
๋ก๋๋ ๋ฌธ์ ์ฌ์ฉ
|
630 |
+
latest_doc = room_context.documents[-1]
|
631 |
+
|
632 |
+
# ๋์
๋๋ฆฌ์ ๊ฐ์ฒด ๋ชจ๋ ์ฒ๋ฆฌ
|
633 |
+
if isinstance(latest_doc, dict):
|
634 |
+
latest_doc_id = latest_doc.get('document_id', 'unknown')
|
635 |
+
latest_user_id = latest_doc.get('uploaded_by', 'unknown')
|
636 |
+
else:
|
637 |
+
latest_doc_id = getattr(latest_doc, 'document_id', 'unknown')
|
638 |
+
latest_user_id = getattr(latest_doc, 'uploaded_by', 'unknown')
|
639 |
+
|
640 |
+
print(f"๐ [DEBUG] ์๋ก์ด RAG ์ปจํ
์คํธ ๊ฒ์: ์ฑํ
๋ฐฉ={room_id}, ์ฌ์ฉ์={latest_user_id}, ๋ฌธ์={latest_doc_id}")
|
641 |
+
|
642 |
+
# vector_stores ๋๋ ํ ๋ฆฌ์์ ํด๋น ๋ฌธ์ ๋ด์ฉ ์ฝ๊ธฐ
|
643 |
+
import os
|
644 |
+
import json
|
645 |
+
|
646 |
+
vector_store_dir = "vector_stores"
|
647 |
+
if os.path.exists(vector_store_dir):
|
648 |
+
user_path = os.path.join(vector_store_dir, latest_user_id)
|
649 |
if os.path.exists(user_path):
|
650 |
+
doc_path = os.path.join(user_path, latest_doc_id)
|
651 |
+
if os.path.exists(doc_path):
|
|
|
|
|
|
|
652 |
|
653 |
# ๋ฌธ์ ๋ด์ฉ ํ์ผ ์ง์ ์ฝ๊ธฐ (pickle ํ์ผ ์ง์)
|
|
|
654 |
if os.path.exists(doc_path):
|
655 |
# ๐ pickle ํ์ผ์์ ๋ด์ฉ ์ฝ๊ธฐ (์ฐ์ )
|
656 |
pickle_file = os.path.join(doc_path, "simple_vector_store.pkl")
|
|
|
665 |
documents_data = vector_store_data['documents']
|
666 |
if documents_data and len(documents_data) > 0:
|
667 |
rag_context = "\n\n๐ ์
๋ก๋๋ ๋ฌธ์ ๋ด์ฉ:\n"
|
668 |
+
|
669 |
+
# ๐ ์ค๋ฌด ์์ค ๊ตฌ์กฐํ๋ PDF ๋ฐ์ดํฐ ์ฒ๋ฆฌ
|
670 |
+
extracted_images = []
|
671 |
for i, doc in enumerate(documents_data[:2]): # ์ต๋ 2๊ฐ ์ฒญํฌ
|
672 |
if hasattr(doc, 'page_content'):
|
673 |
content = doc.page_content.strip()
|
674 |
if content and len(content) > 30:
|
675 |
+
# ๊ตฌ์กฐํ๋ ์ปจํ
์ธ ์ธ์ง ํ์ธ
|
676 |
+
if "=== ํ์ด์ง" in content and "[ํ
์คํธ ๋ธ๋ก" in content:
|
677 |
+
# ์ค๋ฌด ์์ค ๊ตฌ์กฐํ๋ ์ปจํ
์ธ
|
678 |
+
truncated_content = content[:400] + "..." if len(content) > 400 else content
|
679 |
+
rag_context += f"--- ๊ตฌ์กฐํ๋ ํ์ด์ง {i+1} ---\n{truncated_content}\n\n"
|
680 |
+
print(f"๐ [DEBUG] ๊ตฌ์กฐํ๋ PDF ํ์ด์ง ๋ฐ์ดํฐ ๋ก๋๋จ")
|
681 |
+
else:
|
682 |
+
# ๊ธฐ์กด ๋ฐฉ์
|
683 |
+
truncated_content = content[:200] + "..." if len(content) > 200 else content
|
684 |
+
rag_context += f"--- ์ฒญํฌ {i+1} ---\n{truncated_content}\n\n"
|
685 |
+
|
686 |
+
# ๐ ๊ตฌ์กฐํ๋ ์ด๋ฏธ์ง ๋ฉํ๋ฐ์ดํฐ ์ฒ๋ฆฌ
|
687 |
+
if hasattr(doc, 'metadata') and doc.metadata:
|
688 |
+
metadata = doc.metadata
|
689 |
+
|
690 |
+
# ์ค๋ฌด ์์ค ๊ตฌ์กฐํ๋ ๋ฉํ๋ฐ์ดํฐ ํ์ธ
|
691 |
+
if metadata.get('structured_analysis') and metadata.get('spatial_relationships'):
|
692 |
+
print(f"๐ [DEBUG] ์ค๋ฌด ์์ค ๊ตฌ์กฐํ๋ ๋ฉํ๋ฐ์ดํฐ ๋ฐ๊ฒฌ")
|
693 |
+
|
694 |
+
# ์ด๋ฏธ์ง ๋ธ๋ก ์ ๋ณด ์ถ๋ ฅ
|
695 |
+
if 'image_blocks' in metadata:
|
696 |
+
image_blocks = metadata['image_blocks']
|
697 |
+
for ib in image_blocks:
|
698 |
+
print(f"๐ผ๏ธ [DEBUG] ์ด๋ฏธ์ง ๋ธ๋ก: {ib['block_id']}, "
|
699 |
+
f"์์น: ({ib['bbox']['x0']:.1f}, {ib['bbox']['y0']:.1f}), "
|
700 |
+
f"๊ด๋ จ ํ
์คํธ: {ib['related_text_count']}๊ฐ")
|
701 |
+
|
702 |
+
# ์ด๋ฏธ์ง ๋ฐ์ดํฐ ์ถ์ถ (๊ธฐ์กด ๋ฐฉ์ + ์๋ก์ด ๋ฐฉ์ ๋ชจ๋ ์ง์)
|
703 |
+
if metadata.get('multimodal_ready') and 'image_data_list' in metadata:
|
704 |
+
image_data_list = metadata['image_data_list']
|
705 |
+
if image_data_list and len(image_data_list) > 0:
|
706 |
+
extracted_images.extend(image_data_list)
|
707 |
+
print(f"๐ [DEBUG] ์ฒญํฌ {i+1}์์ ์ด๋ฏธ์ง {len(image_data_list)}๊ฐ ์ถ์ถ๋จ")
|
708 |
+
|
709 |
+
# ์ด๋ฏธ์ง ๋ฉํ๋ฐ์ดํฐ๋ ์ถ๋ ฅ (๊ตฌ์กฐํ๋ ๊ฒฝ์ฐ)
|
710 |
+
if 'image_metadata' in metadata:
|
711 |
+
for img_meta in metadata['image_metadata'][:2]:
|
712 |
+
print(f"๐ผ๏ธ [DEBUG] ์ด๋ฏธ์ง ์์ธ: {img_meta['block_id']}, "
|
713 |
+
f"ํฌ๊ธฐ: {img_meta['size']}, "
|
714 |
+
f"๊ด๋ จ ํ
์คํธ: {len(img_meta.get('related_texts', []))}๊ฐ")
|
715 |
|
716 |
if len(rag_context) > 30:
|
717 |
context_prompt += rag_context
|
718 |
print(f"๐ [DEBUG] RAG ์ปจํ
์คํธ ํฌํจ๋จ - ๊ธธ์ด: {len(rag_context)}")
|
719 |
print(f"๐ [DEBUG] RAG ์ปจํ
์คํธ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {rag_context[:100]}...")
|
720 |
+
|
721 |
+
# ๐ ์ถ์ถ๋ ์ด๋ฏธ์ง๊ฐ ์์ผ๋ฉด ๋ก๊ทธ ์ถ๋ ฅ
|
722 |
+
if extracted_images:
|
723 |
+
print(f"๐ [DEBUG] ์ด {len(extracted_images)}๊ฐ ์ด๋ฏธ์ง ๋ฐ์ดํฐ ์ถ์ถ ์๋ฃ")
|
724 |
+
else:
|
725 |
+
print(f"๐ [DEBUG] ์ถ์ถ๋ ์ด๋ฏธ์ง ์์")
|
726 |
else:
|
727 |
print(f"โ ๏ธ [DEBUG] RAG ์ปจํ
์คํธ๊ฐ ๋๋ฌด ์งง์: {len(rag_context)}")
|
728 |
else:
|
|
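The RAG-context branch above reads the latest document's chunks straight from the on-disk vector store rather than going through a retriever. A minimal sketch of that direct read, assuming the layout shown in the diff (vector_stores/<user_id>/<document_id>/simple_vector_store.pkl holding a dict with a 'documents' list of LangChain Document objects) and a plain pickle.load:

import os
import pickle

def load_recent_chunks(user_id: str, document_id: str, max_chunks: int = 2):
    """Sketch of the direct pickle read used above; helper name is hypothetical."""
    pickle_file = os.path.join("vector_stores", user_id, document_id,
                               "simple_vector_store.pkl")
    if not os.path.exists(pickle_file):
        return []
    with open(pickle_file, "rb") as f:
        vector_store_data = pickle.load(f)
    chunks = []
    for doc in vector_store_data.get("documents", [])[:max_chunks]:
        content = getattr(doc, "page_content", "").strip()
        if len(content) > 30:
            chunks.append(content[:400])  # same truncation limit as the structured path above
    return chunks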
|
835 |
# --- 3. ํ ํฌ๋์ด์ง ---
|
836 |
print(f"๐ [DEBUG] ํ ํฌ๋์ด์ง ์์")
|
837 |
t_tok_start = time.time()
|
838 |
+
if not all_image_data or len([img for img in all_image_data if img]) == 0:
|
839 |
# ํ
์คํธ-only ๊ณ ์ ๊ฒฝ๋ก (๋ ๋น ๋ฆ)
|
840 |
print(f"๐ [DEBUG] ํ
์คํธ-only ํ ํฌ๋์ด์ง ๊ฒฝ๋ก")
|
841 |
print(f"๐ [DEBUG] ์ฌ์ฉํ ํ๋กฌํํธ: {formatted_prompt}")
|
|
|
858 |
# ๋ฉํฐ๋ชจ๋ฌ(Lite): Kanana ์ ์ฉ encode_prompt๋ก -1 ํ ํฐ ์๋ฆฌ ์์ฑ (ํ์)
|
859 |
print(f"๐ [DEBUG] ๋ฉํฐ๋ชจ๋ฌ ํ ํฌ๋์ด์ง ๊ฒฝ๋ก")
|
860 |
print(f"๐ [DEBUG] combined_image_metas: {combined_image_metas}")
|
861 |
+
print(f"๐ [DEBUG] ์ด ์ด๋ฏธ์ง ๊ฐ์: {len(all_image_data)}")
|
862 |
|
863 |
if hasattr(tokenizer, 'encode_prompt'):
|
864 |
print(f"๐ [DEBUG] encode_prompt ๋ฉ์๋ ์ฌ์ฉ")
|
|
|
1457 |
total_time = time.time() - t_tok_start
|
1458 |
print(f"๐ [DEBUG] ์ ์ฒด ์ฒ๋ฆฌ ์๋ฃ - ์ด ์์์๊ฐ: {total_time:.3f}์ด")
|
1459 |
|
1460 |
+
# ๐ ์ด๋ฏธ์ง ์ฒ๋ฆฌ ์๋ฃ (์ ์ญ ๋ณ์ ์ด๊ธฐํ๋ ์ ๊ฑฐ๋จ)
|
1461 |
+
|
1462 |
return {
|
1463 |
"generated_text": response,
|
1464 |
"processing_time": total_time,
|
|
|
1634 |
image2: UploadFile = File(None),
|
1635 |
image3: UploadFile = File(None),
|
1636 |
image4: UploadFile = File(None),
|
1637 |
+
user_id: str = Form("anonymous"),
|
1638 |
+
room_id: str = Form("default"),
|
1639 |
use_context: bool = Form(True),
|
1640 |
session_id: str = Form(None)):
|
1641 |
|
|
|
1644 |
|
1645 |
start_time = time.time()
|
1646 |
|
1647 |
+
# ์ธ์
ID๊ฐ ์์ผ๋ฉด ์๋ ์์ฑ (์ฑํ
๋ฐฉ๋ณ ๊ณ ์ ์ธ์
)
|
1648 |
if not session_id:
|
1649 |
+
# ์ฑํ
๋ฐฉ + ์ฌ์ฉ์ + ํ์์คํฌํ ๊ธฐ๋ฐ์ผ๋ก ๊ณ ์ ํ ์ธ์
์์ฑ
|
1650 |
+
timestamp = int(time.time())
|
1651 |
+
session_id = f"room_{room_id}_user_{user_id}_{timestamp}"
|
1652 |
+
print(f"๐ [DEBUG] ์๋ ์ธ์
ID ์์ฑ: {session_id} (์ฑํ
๋ฐฉ: {room_id}, ์ฌ์ฉ์: {user_id})")
|
|
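With the new form fields, a client can pin the conversation to a chat room and a user and let the server derive the session ID. A hedged example call (the route path "/generate" is an assumption, it is not visible in this diff; the form field names come from the endpoint signature above):

import requests

resp = requests.post(
    "http://localhost:8000/generate",   # assumed route
    data={
        "prompt": "Summarize the uploaded document.",
        "user_id": "kdy",
        "room_id": "default",
        "use_context": "true",
        # session_id omitted -> server builds "room_{room_id}_user_{user_id}_{timestamp}"
    },
)
print(resp.json())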
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1653 |
|
1654 |
if use_context:
|
1655 |
context_manager.add_user_message(prompt, metadata={"session_id": session_id})
|
|
|
1667 |
|
1668 |
try:
|
1669 |
# generate_sync ํจ์ ํธ์ถ (์ปจํ
์คํธ ํฌํจ)
|
1670 |
+
result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id, user_id=user_id, room_id=room_id)
|
1671 |
|
1672 |
if "error" in result:
|
1673 |
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
1817 |
async def upload_document(
|
1818 |
file: UploadFile = File(...),
|
1819 |
user_id: str = Form("default_user"), # ๊ธฐ๋ณธ ์ฌ์ฉ์ ID
|
1820 |
+
room_id: str = Form("default"), # ์ฑํ
๋ฐฉ ID
|
1821 |
document_id: Optional[str] = Form(None) # ๋ฌธ์ ID (์๋ ์์ฑ ๊ฐ๋ฅ)
|
1822 |
):
|
1823 |
"""๋ฌธ์ ์
๋ก๋ ๋ฐ RAG ์ฒ๋ฆฌ"""
|
|
|
1848 |
processing_time = time.time() - start_time
|
1849 |
logger.info(f"๐ ๋ฌธ์ ์
๋ก๋ ์๋ฃ ({processing_time:.2f}์ด): {file.filename}")
|
1850 |
|
1851 |
+
# ์๋ก์ด ๋ฉ๋ชจ๋ฆฌ ์์คํ
์ ๋ฌธ์ ์ ๋ณด ์ถ๊ฐ
|
1852 |
if result["success"]:
|
1853 |
try:
|
1854 |
+
# ๋ฌธ์ ์ ๋ณด๋ฅผ ์ฑํ
๋ฐฉ ์ปจํ
์คํธ์ ์ถ๊ฐ
|
1855 |
+
chunks = result.get("chunks", [])
|
1856 |
+
chunk_count = len(chunks) if isinstance(chunks, list) else 0
|
1857 |
|
1858 |
+
document_info = {
|
1859 |
+
"document_id": document_id,
|
1860 |
+
"filename": file.filename,
|
1861 |
+
"uploaded_by": user_id,
|
1862 |
+
"document_type": file.filename.split('.')[-1].lower() if '.' in file.filename else "unknown",
|
1863 |
+
"page_count": result.get("page_count", 0),
|
1864 |
+
"chunk_count": chunk_count,
|
1865 |
+
"summary": result.get("message", "")
|
1866 |
+
}
|
1867 |
+
|
1868 |
+
# ํตํฉ ๋ฉ๋ชจ๋ฆฌ ๊ด๋ฆฌ์์ ๋ฌธ์ ์ถ๊ฐ
|
1869 |
+
integrated_memory_manager.add_document_to_room(room_id, document_info)
|
1870 |
+
|
1871 |
+
# ์ฌ์ฉ์ ํต๊ณ ์
๋ฐ์ดํธ
|
1872 |
+
integrated_memory_manager.record_conversation(
|
1873 |
+
user_id, room_id,
|
1874 |
+
topic=f"๋ฌธ์ ์
๋ก๋: {file.filename}"
|
1875 |
+
)
|
1876 |
+
|
1877 |
+
logger.info(f"โ
๋ฉ๋ชจ๋ฆฌ ์์คํ
์ ๋ฌธ์ ์ ๋ณด ์ถ๊ฐ ์๋ฃ: {room_id} - {file.filename}")
|
1878 |
|
|
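add_document_to_room and record_conversation are methods of the new integrated memory manager; the per-room persistence presumably lives in lily_llm_core/room_context_manager.py, which this commit adds (along with room_contexts/default.json) but does not show. A hypothetical sketch of that room-side bookkeeping, assuming JSON-per-room storage and documents stored as dicts shaped like the document_info built above:

import json
import os
import time

class RoomContextManager:
    """Hypothetical sketch -- the real class is not part of this diff."""
    def __init__(self, base_dir: str = "room_contexts"):
        self.base_dir = base_dir
        os.makedirs(base_dir, exist_ok=True)

    def _path(self, room_id: str) -> str:
        return os.path.join(self.base_dir, f"{room_id}.json")

    def load(self, room_id: str) -> dict:
        if os.path.exists(self._path(room_id)):
            with open(self._path(room_id), encoding="utf-8") as f:
                return json.load(f)
        return {"room_id": room_id, "documents": []}

    def add_document(self, room_id: str, document_info: dict) -> None:
        context = self.load(room_id)
        context.setdefault("documents", []).append(
            {**document_info, "uploaded_at": time.time()}
        )
        with open(self._path(room_id), "w", encoding="utf-8") as f:
            json.dump(context, f, ensure_ascii=False, indent=2)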
|
|
|
|
|
1879 |
except Exception as e:
|
1880 |
+
logger.warning(f"โ ๏ธ ๋ฉ๋ชจ๋ฆฌ ์์คํ
์
๋ฐ์ดํธ ์คํจ: {e}")
|
1881 |
+
|
1882 |
+
# ๋ฌธ์ ์
๋ก๋ ํ ์๋ AI ์๋ต ์์ฑ ๋นํ์ฑํ (AI ๋ฆฌ์์ค ์ ์ฝ)
|
1883 |
+
# ์ฌ์ฉ์๊ฐ ์ง์ ์ง๋ฌธํ ๋๋ง AI ์๋ต ์์ฑ
|
1884 |
+
auto_generate_response = False
|
1885 |
+
|
1886 |
+
if result["success"]:
|
1887 |
+
# ์๋ AI ์์ฝ ์์ด ๋ฌธ์ ์
๋ก๋๋ง ์๋ฃ
|
1888 |
+
result["auto_response"] = f"๋ฌธ์ '{file.filename}' ์
๋ก๋ ์๋ฃ! ์ด์ ์ง๋ฌธํด์ฃผ์ธ์."
|
1889 |
+
logger.info(f"๐ ์๋ AI ์๋ต ์์ฑ ๊ฑด๋๋ฐ๊ธฐ - AI ๋ฆฌ์์ค ์ ์ฝ (์ฌ์ฉ์ ์ง๋ฌธ ์์๋ง AI ์๋ต)")
|
1890 |
+
else:
|
1891 |
+
result["auto_response"] = "๋ฌธ์ ์
๋ก๋์ ์คํจํ์ต๋๋ค."
|
1892 |
|
1893 |
return DocumentUploadResponse(
|
1894 |
success=result["success"],
|
lily_llm_core/document_processor.py
CHANGED
@@ -2,41 +2,34 @@
|
|
2 |
"""
|
3 |
๋ฌธ์ ์ฒ๋ฆฌ ๋ชจ๋
|
4 |
PDF, DOCX, PPTX ๋ฑ ๋ค์ํ ๋ฌธ์ ํ์์ ์ฒ๋ฆฌ
|
|
|
5 |
"""
|
6 |
|
7 |
import os
|
8 |
import logging
|
9 |
-
from typing import List, Dict, Any, Optional
|
10 |
from pathlib import Path
|
11 |
import easyocr
|
12 |
import re
|
13 |
import base64
|
14 |
import io
|
|
|
|
|
15 |
|
16 |
# LangChain ๋ฌธ์ ๋ก๋๋ค
|
17 |
try:
|
18 |
from langchain_community.document_loaders import (
|
19 |
PyMuPDFLoader,
|
20 |
UnstructuredWordDocumentLoader,
|
21 |
-
UnstructuredPowerPointLoader
|
22 |
-
UnstructuredFileLoader
|
23 |
)
|
24 |
-
|
25 |
-
|
26 |
-
logger.error("pymupdf ํจํค์ง๋ฅผ ์ค์นํด์ฃผ์ธ์: pip install pymupdf")
|
27 |
-
raise
|
28 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
29 |
-
from langchain.schema import Document
|
30 |
-
|
31 |
-
# OCR imports
|
32 |
-
try:
|
33 |
-
import easyocr
|
34 |
-
EASYOCR_AVAILABLE = True
|
35 |
except ImportError:
|
36 |
-
|
37 |
-
|
38 |
|
39 |
-
#
|
40 |
try:
|
41 |
from PIL import Image, ImageEnhance
|
42 |
PIL_AVAILABLE = True
|
@@ -47,48 +40,149 @@ except ImportError:
|
|
47 |
|
48 |
logger = logging.getLogger(__name__)
|
49 |
|
|
|
50 |
class DocumentProcessor:
|
51 |
"""๋ฌธ์ ์ฒ๋ฆฌ ํด๋์ค"""
|
52 |
|
53 |
def __init__(self, formula_ocr_engine: str = 'easyocr'):
|
54 |
"""
|
|
|
|
|
55 |
Args:
|
56 |
formula_ocr_engine: ์์ ์ถ์ถ ์์ง ('easyocr', 'mathpix', 'latexocr')
|
57 |
"""
|
58 |
self.formula_ocr_engine = formula_ocr_engine
|
59 |
-
self.
|
60 |
-
'.pdf': 'pdf',
|
61 |
-
'.docx': 'docx',
|
62 |
-
'.doc': 'doc',
|
63 |
-
'.pptx': 'pptx',
|
64 |
-
'.ppt': 'ppt',
|
65 |
-
'.txt': 'text'
|
66 |
-
}
|
67 |
|
68 |
-
#
|
69 |
-
self.
|
70 |
-
|
71 |
-
chunk_overlap=200,
|
72 |
-
length_function=len,
|
73 |
-
separators=["\n\n", "\n", " ", ""]
|
74 |
-
)
|
75 |
|
76 |
-
# OCR
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
79 |
self.ocr_reader = None
|
80 |
-
else:
|
81 |
-
try:
|
82 |
-
# ๋งค์ฐ ๊ธฐ๋ณธ์ ์ธ ์ค์ ์ผ๋ก ๋จ์ํ
|
83 |
-
self.ocr_reader = easyocr.Reader(
|
84 |
-
['ko', 'en'],
|
85 |
-
gpu=False,
|
86 |
-
verbose=True, # ๋๋ฒ๊น
์ ์ํด verbose ํ์ฑํ
|
87 |
-
)
|
88 |
-
logger.info("โ
OCR ๋ฆฌ๋ ์ด๊ธฐํ ์๋ฃ (๊ธฐ๋ณธ ์ค์ )")
|
89 |
-
except Exception as e:
|
90 |
-
logger.error(f"โ OCR ๋ฆฌ๋ ์ด๊ธฐํ ์คํจ: {e}")
|
91 |
-
self.ocr_reader = None
|
92 |
|
93 |
# ์์ ์ถ์ถ ์์ง ์ค์ (LaTeX-OCR ๋นํ์ฑํ๋จ)
|
94 |
if formula_ocr_engine in ['mathpix']: # 'latexocr' ์ ๊ฑฐ
|
@@ -101,345 +195,57 @@ class DocumentProcessor:
|
|
101 |
logger.warning(f"โ ๏ธ ์์ ์ถ์ถ ์์ง {formula_ocr_engine} ์ฌ์ฉ ๋ถ๊ฐ, EasyOCR๋ก ๋์ฒด")
|
102 |
else:
|
103 |
self.formula_extractor_available = False
|
104 |
-
|
105 |
-
def get_file_type(self, file_path: str) -> Optional[str]:
|
106 |
-
"""ํ์ผ ํ์
ํ์ธ"""
|
107 |
-
try:
|
108 |
-
# ํ์ผ ๊ฒฝ๋ก์์ ํ์ฅ์ ์ถ์ถ
|
109 |
-
file_path_str = str(file_path)
|
110 |
-
# Path ๊ฐ์ฒด๋ฅผ ์ฌ์ฉํ์ฌ ํ์ฅ์ ์ถ์ถ
|
111 |
-
path_obj = Path(file_path_str)
|
112 |
-
extension = path_obj.suffix.lower()
|
113 |
-
|
114 |
-
if extension:
|
115 |
-
# ๐ ํ์ฅ์ ์์ ์ (.) ์ ๊ฑฐํ์ฌ ๋ฐํ
|
116 |
-
file_type = extension[1:] if extension.startswith('.') else extension
|
117 |
-
logger.info(f"๐ ํ์ผ ํ์ฅ์ ์ธ์: {extension} -> {file_type}")
|
118 |
-
return file_type
|
119 |
-
else:
|
120 |
-
logger.warning(f"โ ๏ธ ํ์ผ ํ์ฅ์๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค: {file_path}")
|
121 |
-
return None
|
122 |
-
except Exception as e:
|
123 |
-
logger.error(f"โ ํ์ผ ํ์
ํ์ธ ์คํจ: {e}")
|
124 |
-
return None
|
125 |
-
|
126 |
-
def extract_text_from_image(self, image_data: bytes) -> str:
|
127 |
-
"""์ด๋ฏธ์ง์์ ํ
์คํธ ์ถ์ถ (OCR) - ์ํ ๊ธฐํธ ์ต์ ํ"""
|
128 |
-
if not self.ocr_reader:
|
129 |
-
logger.warning("โ ๏ธ OCR ๋ฆฌ๋๊ฐ ์ด๊ธฐํ๋์ง ์์์ต๋๋ค.")
|
130 |
-
return ""
|
131 |
|
132 |
-
|
133 |
-
import numpy as np
|
134 |
-
import io
|
135 |
-
|
136 |
-
# ์ด๋ฏธ์ง ๋ฐ์ดํฐ๋ฅผ PIL Image๋ก ๋ณํ
|
137 |
-
pil_image = Image.open(io.BytesIO(image_data))
|
138 |
-
logger.info(f"๐ธ ์ด๋ฏธ์ง ํฌ๊ธฐ: {pil_image.size}")
|
139 |
-
|
140 |
-
# ์ด๋ฏธ์ง ์ ์ฒ๋ฆฌ (์ํ ๊ธฐํธ ์ธ์ ๊ฐ์ )
|
141 |
-
if PIL_AVAILABLE:
|
142 |
-
pil_image = self.preprocess_image_for_math(pil_image)
|
143 |
-
logger.info(f"๐ธ ์ ์ฒ๋ฆฌ ํ ์ด๋ฏธ์ง ํฌ๊ธฐ: {pil_image.size}")
|
144 |
-
|
145 |
-
# PIL Image๋ฅผ numpy ๋ฐฐ์ด๋ก ๋ณํ
|
146 |
-
img_array = np.array(pil_image)
|
147 |
-
logger.info(f"๐ธ numpy ๋ฐฐ์ด ํํ: {img_array.shape}")
|
148 |
-
|
149 |
-
# OCR ์คํ (๊ธฐ๋ณธ ์ค์ ์ผ๋ก ๋จ์ํ)
|
150 |
-
logger.info("๐ OCR ์คํ ์์...")
|
151 |
-
results = self.ocr_reader.readtext(
|
152 |
-
img_array,
|
153 |
-
paragraph=True, # ๋จ๋ฝ ๋จ์๋ก ์ฒ๋ฆฌ
|
154 |
-
)
|
155 |
-
logger.info(f"๐ OCR ๊ฒฐ๊ณผ ๊ฐ์: {len(results)}")
|
156 |
-
|
157 |
-
# ์ถ์ถ๋ ํ
์คํธ ๊ฒฐํฉ (์ ๋ขฐ๋ ๊ธฐ๋ฐ ํํฐ๋ง)
|
158 |
-
extracted_text = ""
|
159 |
-
for i, result in enumerate(results):
|
160 |
-
try:
|
161 |
-
# ๊ฒฐ๊ณผ ํ์ ํ์ธ ๋ฐ ์์ ํ ์ฒ๋ฆฌ
|
162 |
-
if len(result) == 3:
|
163 |
-
bbox, text, confidence = result
|
164 |
-
elif len(result) == 2:
|
165 |
-
bbox, text = result
|
166 |
-
confidence = 0.5 # ๊ธฐ๋ณธ ์ ๋ขฐ๋
|
167 |
-
else:
|
168 |
-
logger.warning(f"โ ๏ธ ์์์น ๋ชปํ OCR ๊ฒฐ๊ณผ ํ์: {result}")
|
169 |
-
continue
|
170 |
-
|
171 |
-
logger.info(f"๐ ๊ฒฐ๊ณผ {i+1}: '{text}' (์ ๋ขฐ๋: {confidence:.2f})")
|
172 |
-
|
173 |
-
if confidence > 0.3: # ์ ๋ขฐ๋ ์๊ณ๊ฐ์ 30%๋ก ๋ฎ์ถค
|
174 |
-
# ์ํ ๊ธฐํธ ํ์ฒ๋ฆฌ
|
175 |
-
processed_text = self.post_process_math_symbols(text)
|
176 |
-
extracted_text += processed_text + " "
|
177 |
-
else:
|
178 |
-
logger.info(f"โ ๏ธ ์ ๋ขฐ๋ ๋ฎ์ ์ ์ธ: '{text}' (์ ๋ขฐ๋: {confidence:.2f})")
|
179 |
-
|
180 |
-
except Exception as e:
|
181 |
-
logger.warning(f"โ ๏ธ OCR ๊ฒฐ๊ณผ ์ฒ๋ฆฌ ์คํจ (๊ฒฐ๊ณผ {i+1}): {e}")
|
182 |
-
continue
|
183 |
-
|
184 |
-
# LaTeX ์์ ํจํด ๊ฐ์ง ๋ฐ ์ ๋ฆฌ
|
185 |
-
latex_patterns = [
|
186 |
-
r'\\[a-zA-Z]+', # LaTeX ๋ช
๋ น์ด
|
187 |
-
r'\\[a-zA-Z]+\{[^}]*\}', # LaTeX ๋ช
๋ น์ด + ์ธ์
|
188 |
-
r'\$[^$]+\$', # ์ธ๋ผ์ธ ์์
|
189 |
-
r'\$\$[^$]+\$\$', # ๋ธ๋ก ์์
|
190 |
-
r'\\begin\{[^}]*\}.*?\\end\{[^}]*\}', # ํ๊ฒฝ
|
191 |
-
]
|
192 |
-
|
193 |
-
latex_text = ""
|
194 |
-
for pattern in latex_patterns:
|
195 |
-
matches = re.findall(pattern, extracted_text)
|
196 |
-
if matches:
|
197 |
-
latex_text += " ".join(matches) + "\n"
|
198 |
-
|
199 |
-
# ์ต์ข
ํ
์คํธ ๊ตฌ์ฑ
|
200 |
-
final_text = extracted_text.strip()
|
201 |
-
if latex_text.strip():
|
202 |
-
final_text += f"\n\n[LaTeX ์์ ๊ฐ์ง]\n{latex_text.strip()}"
|
203 |
-
|
204 |
-
logger.info(f"โ
OCR ํ
์คํธ ์ถ์ถ ์๋ฃ: {len(final_text)}์")
|
205 |
-
if len(final_text) == 0:
|
206 |
-
logger.warning("โ ๏ธ OCR์์ ํ
์คํธ๋ฅผ ์ถ์ถํ์ง ๋ชปํ์ต๋๋ค.")
|
207 |
-
|
208 |
-
return final_text
|
209 |
-
|
210 |
-
except Exception as e:
|
211 |
-
logger.error(f"โ OCR ํ
์คํธ ์ถ์ถ ์คํจ: {e}")
|
212 |
-
return ""
|
213 |
|
214 |
-
def
|
215 |
-
"""
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
try:
|
221 |
-
# ๊ธฐ๋ณธ์ ์ธ ํฌ๊ธฐ ์กฐ์ ๋ง ์ํ
|
222 |
-
width, height = image.size
|
223 |
-
logger.debug(f"๐ธ ์๋ณธ ์ด๋ฏธ์ง ํฌ๊ธฐ: {width}x{height}")
|
224 |
-
|
225 |
-
# ๋๋ฌด ์์ ์ด๋ฏธ์ง๋ง ํ๋ (์์ ์ฑ ์ฐ์ )
|
226 |
-
if width < 1000 or height < 1000:
|
227 |
-
scale_factor = max(1000 / width, 1000 / height)
|
228 |
-
new_width = int(width * scale_factor)
|
229 |
-
new_height = int(height * scale_factor)
|
230 |
-
image = image.resize((new_width, new_height), Image.LANCZOS)
|
231 |
-
logger.debug(f"๐ธ ํ๋ ํ ์ด๋ฏธ์ง ํฌ๊ธฐ: {new_width}x{new_height}")
|
232 |
-
|
233 |
-
# ๊ธฐ๋ณธ์ ์ธ ๋๋น ๊ฐ์ ๋ง ์ํ
|
234 |
-
try:
|
235 |
-
enhancer = ImageEnhance.Contrast(image)
|
236 |
-
image = enhancer.enhance(1.1) # ๋๋น 10% ์ฆ๊ฐ (๋ฎ์ถค)
|
237 |
-
logger.debug("๐ธ ๋๋น ๊ฐ์ ์๋ฃ")
|
238 |
-
except Exception as e:
|
239 |
-
logger.debug(f"โ ๏ธ ๋๋น ๊ฐ์ ์คํจ: {e}")
|
240 |
-
|
241 |
-
return image
|
242 |
-
|
243 |
-
except Exception as e:
|
244 |
-
logger.warning(f"โ ๏ธ ์ด๋ฏธ์ง ์ ์ฒ๋ฆฌ ์คํจ: {e}")
|
245 |
-
return image
|
246 |
-
|
247 |
-
def post_process_math_symbols(self, text: str) -> str:
|
248 |
-
"""์ํ ๊ธฐํธ ํ์ฒ๋ฆฌ"""
|
249 |
-
if not text:
|
250 |
-
return text
|
251 |
-
|
252 |
-
# ์ํ ๊ธฐํธ ๋งคํ (OCR ์ค์ธ์ ๊ฐ์ )
|
253 |
-
math_symbols = {
|
254 |
-
'@': 'ร', # ๊ณฑํ๊ธฐ ๊ธฐํธ
|
255 |
-
'4': '=', # ๋ฑํธ
|
256 |
-
'๊ตฌ': 'f', # ํจ์ f
|
257 |
-
'B': 'ฮฒ', # ๋ฒ ํ
|
258 |
-
'A': 'ฮฑ', # ์ํ
|
259 |
-
'C': 'ฮณ', # ๊ฐ๋ง
|
260 |
-
'D': 'ฮด', # ๋ธํ
|
261 |
-
'E': 'ฮต', # ์ก์ค๋ก
|
262 |
-
'F': 'ฯ', # ํ์ด
|
263 |
-
'G': 'ฮณ', # ๊ฐ๋ง
|
264 |
-
'H': 'ฮท', # ์ํ
|
265 |
-
'I': 'ฮน', # ์ด์คํ
|
266 |
-
'K': 'ฮบ', # ์นดํ
|
267 |
-
'L': 'ฮป', # ๋๋ค
|
268 |
-
'M': 'ฮผ', # ๋ฎค
|
269 |
-
'N': 'ฮฝ', # ๋ด
|
270 |
-
'O': 'ฮฟ', # ์ค๋ฏธํฌ๋ก
|
271 |
-
'P': 'ฯ', # ํ์ด
|
272 |
-
'Q': 'ฮธ', # ์ธํ
|
273 |
-
'R': 'ฯ', # ๋ก
|
274 |
-
'S': 'ฯ', # ์๊ทธ๋ง
|
275 |
-
'T': 'ฯ', # ํ์ฐ
|
276 |
-
'U': 'ฯ
', # ์์ค๋ก
|
277 |
-
'V': 'ฯ', # ์ค๋ฉ๊ฐ
|
278 |
-
'W': 'ฯ', # ํ์ฌ์ด
|
279 |
-
'X': 'ฯ', # ์นด์ด
|
280 |
-
'Y': 'ฯ
', # ์์ค๋ก
|
281 |
-
'Z': 'ฮถ', # ์ ํ
|
282 |
-
}
|
283 |
-
|
284 |
-
# ์ํ ๊ธฐํธ ๊ต์ฒด
|
285 |
-
for wrong, correct in math_symbols.items():
|
286 |
-
text = text.replace(wrong, correct)
|
287 |
-
|
288 |
-
return text
|
289 |
-
|
290 |
-
def post_process_ocr_text(self, text: str) -> str:
|
291 |
-
"""OCR ๊ฒฐ๊ณผ ํ์ฒ๋ฆฌ - ์ํ ๊ธฐํธ ๊ฐ์ """
|
292 |
-
if not text:
|
293 |
-
return text
|
294 |
-
|
295 |
-
# ์ํ ๊ธฐํธ ๋งคํ (OCR ์ค์ธ์ ๊ฐ์ )
|
296 |
-
math_symbols = {
|
297 |
-
'@': 'ร', # ๊ณฑํ๊ธฐ ๊ธฐํธ
|
298 |
-
'4': '=', # ๋ฑํธ
|
299 |
-
'๊ตฌ': 'f', # ํจ์ f
|
300 |
-
'B': 'ฮฒ', # ๋ฒ ํ
|
301 |
-
'A': 'ฮฑ', # ์ํ
|
302 |
-
'C': 'ฮณ', # ๊ฐ๋ง
|
303 |
-
'D': 'ฮด', # ๋ธํ
|
304 |
-
'E': 'ฮต', # ์ก์ค๋ก
|
305 |
-
'F': 'ฯ', # ํ์ด
|
306 |
-
'G': 'ฮณ', # ๊ฐ๋ง
|
307 |
-
'H': 'ฮท', # ์ํ
|
308 |
-
'I': 'ฮน', # ์ด์คํ
|
309 |
-
'K': 'ฮบ', # ์นดํ
|
310 |
-
'L': 'ฮป', # ๋๋ค
|
311 |
-
'M': 'ฮผ', # ๋ฎค
|
312 |
-
'N': 'ฮฝ', # ๋ด
|
313 |
-
'O': 'ฮฟ', # ์ค๋ฏธํฌ๋ก
|
314 |
-
'P': 'ฯ', # ํ์ด
|
315 |
-
'Q': 'ฮธ', # ์ธํ
|
316 |
-
'R': 'ฯ', # ๋ก
|
317 |
-
'S': 'ฯ', # ์๊ทธ๋ง
|
318 |
-
'T': 'ฯ', # ํ์ฐ
|
319 |
-
'U': 'ฯ
', # ์์ค๋ก
|
320 |
-
'V': 'ฯ', # ์ค๋ฉ๊ฐ
|
321 |
-
'W': 'ฯ', # ํ์ฌ์ด
|
322 |
-
'X': 'ฯ', # ์นด์ด
|
323 |
-
'Y': 'ฯ
', # ์์ค๋ก
|
324 |
-
'Z': 'ฮถ', # ์ ํ
|
325 |
-
'0': 'ฮธ', # ์ธํ (์ซ์ 0๊ณผ ํผ๋)
|
326 |
-
'1': 'ฮน', # ์ด์คํ (์ซ์ 1๊ณผ ํผ๋)
|
327 |
-
'2': 'ฮถ', # ์ ํ (์ซ์ 2์ ํผ๋)
|
328 |
-
'3': 'ฮพ', # ํฌ์ (์ซ์ 3๊ณผ ํผ๋)
|
329 |
-
'5': 'ฯ', # ์๊ทธ๋ง (์ซ์ 5์ ํผ๋)
|
330 |
-
'6': 'ฯ', # ์๊ทธ๋ง (์ซ์ 6๊ณผ ํผ๋)
|
331 |
-
'7': 'ฮท', # ์ํ (์ซ์ 7๊ณผ ํผ๋)
|
332 |
-
'8': 'ฮธ', # ์ธํ (์ซ์ 8๊ณผ ํผ๋)
|
333 |
-
'9': 'ฮถ', # ์ ํ (์ซ์ 9์ ํผ๋)
|
334 |
-
}
|
335 |
-
|
336 |
-
# ์ํ ๊ธฐํธ ๊ต์ฒด
|
337 |
-
for wrong, correct in math_symbols.items():
|
338 |
-
text = text.replace(wrong, correct)
|
339 |
-
|
340 |
-
# LaTeX ์์ ํจํด ๊ฐ์ง ๋ฐ ๊ฐ์
|
341 |
-
latex_patterns = [
|
342 |
-
(r'f\s*\(\s*([^)]+)\s*\)', r'f(\1)'), # ํจ์ ํ๊ธฐ ์ ๋ฆฌ
|
343 |
-
(r'lim\s*([^โ]+)โ([^=]+)=', r'\\lim_{\1 \\to \2} ='), # ๊ทนํ ํ๊ธฐ
|
344 |
-
(r'โซ\s*([^d]+)d([^=]+)', r'\\int \1 d\2'), # ์ ๋ถ ํ๊ธฐ
|
345 |
-
(r'โ\s*([^=]+)=', r'\\sum \1 ='), # ํฉ ํ๊ธฐ
|
346 |
-
(r'ฯ', r'\\pi'), # ํ์ด
|
347 |
-
(r'โ', r'\\infty'), # ๋ฌดํ๋
|
348 |
-
(r'โ([^=]+)', r'\\sqrt{\1}'), # ์ ๊ณฑ๊ทผ
|
349 |
-
(r'([0-9]+)\^([0-9]+)', r'\1^{\\2}'), # ์ง์
|
350 |
-
(r'([0-9]+)/([0-9]+)', r'\\frac{\1}{\2}'), # ๋ถ์
|
351 |
-
]
|
352 |
-
|
353 |
-
for pattern, replacement in latex_patterns:
|
354 |
-
text = re.sub(pattern, replacement, text)
|
355 |
-
|
356 |
-
return text
|
357 |
|
358 |
def load_document(self, file_path: str) -> List[Document]:
|
359 |
-
"""๋ฌธ์ ๋ก๋
|
360 |
-
file_type = self.get_file_type(file_path)
|
361 |
-
if not file_type:
|
362 |
-
raise ValueError(f"์ง์ํ์ง ์๋ ํ์ผ ํ์: {file_path}")
|
363 |
-
|
364 |
-
documents = []
|
365 |
-
|
366 |
try:
|
|
|
|
|
367 |
if file_type == 'pdf':
|
368 |
-
|
369 |
-
import fitz # PyMuPDF
|
370 |
-
|
371 |
-
doc = fitz.open(file_path)
|
372 |
-
logger.info(f"๐ PDF ๋ฌธ์ ๋ก๋: {len(doc)} ํ์ด์ง")
|
373 |
-
|
374 |
-
for page_num in range(len(doc)):
|
375 |
-
page = doc.load_page(page_num)
|
376 |
-
|
377 |
-
# ํ
์คํธ ์ง์ ์ถ์ถ (OCR ๋์ )
|
378 |
-
page_text = page.get_text()
|
379 |
-
|
380 |
-
# ํ
์คํธ๊ฐ ๋น์ด์๊ฑฐ๋ ๋๋ฌด ์งง์ ๊ฒฝ์ฐ์๋ง OCR ์ฌ์ฉ
|
381 |
-
if not page_text.strip() or len(page_text.strip()) < 50:
|
382 |
-
logger.info(f"โ ๏ธ ํ์ด์ง {page_num + 1} ํ
์คํธ ์ถ์ถ ์คํจ, OCR ์ฌ์ฉ")
|
383 |
-
pix = page.get_pixmap(dpi=300)
|
384 |
-
img_data = pix.tobytes("png")
|
385 |
-
page_text = self.extract_text_from_image(img_data)
|
386 |
-
page_text = self.post_process_ocr_text(page_text)
|
387 |
-
|
388 |
-
# ๋ฉํ๋ฐ์ดํฐ ์ค์
|
389 |
-
metadata = {
|
390 |
-
'source': file_path,
|
391 |
-
'page': page_num + 1,
|
392 |
-
'file_type': 'pdf',
|
393 |
-
'processing_method': 'text_extraction' if page_text.strip() else 'ocr'
|
394 |
-
}
|
395 |
-
|
396 |
-
# Document ๊ฐ์ฒด ์์ฑ
|
397 |
-
documents.append(Document(
|
398 |
-
page_content=page_text,
|
399 |
-
metadata=metadata
|
400 |
-
))
|
401 |
-
|
402 |
-
logger.info(f"โ
ํ์ด์ง {page_num + 1} ์ฒ๋ฆฌ ์๋ฃ: {len(page_text)} ๋ฌธ์")
|
403 |
-
|
404 |
-
doc.close()
|
405 |
-
logger.info(f"โ
PDF ๋ฌธ์ ๋ก๋ ์๋ฃ: {len(documents)}๊ฐ ํ์ด์ง")
|
406 |
-
return documents
|
407 |
-
|
408 |
elif file_type == 'docx':
|
409 |
loader = UnstructuredWordDocumentLoader(file_path)
|
410 |
-
documents = loader.load()
|
411 |
-
logger.info(f"โ
DOCX ๋ฌธ์ ๋ก๋ ์๋ฃ: {len(documents)}๊ฐ ์ฒญํฌ")
|
412 |
-
|
413 |
elif file_type == 'pptx':
|
414 |
loader = UnstructuredPowerPointLoader(file_path)
|
415 |
-
documents = loader.load()
|
416 |
-
logger.info(f"โ
PPTX ๋ฌธ์ ๋ก๋ ์๋ฃ: {len(documents)}๊ฐ ์ฒญํฌ")
|
417 |
-
|
418 |
-
elif file_type == 'text':
|
419 |
-
loader = UnstructuredFileLoader(file_path)
|
420 |
-
documents = loader.load()
|
421 |
-
logger.info(f"โ
ํ
์คํธ ๋ฌธ์ ๋ก๋ ์๋ฃ: {len(documents)}๊ฐ ์ฒญํฌ")
|
422 |
-
|
423 |
else:
|
424 |
-
|
425 |
-
|
|
|
|
|
|
|
|
|
|
|
426 |
except Exception as e:
|
427 |
logger.error(f"โ ๋ฌธ์ ๋ก๋ ์คํจ: {e}")
|
428 |
-
|
429 |
-
|
430 |
-
return documents
|
431 |
|
432 |
-
def split_documents(self, documents: List[Document]) -> List[Document]:
|
433 |
"""๋ฌธ์๋ฅผ ์ฒญํฌ๋ก ๋ถํ """
|
434 |
-
logger.info(f"๐ ๋ฌธ์ ๋ถํ ์ค: {len(documents)}๊ฐ ๋ฌธ์")
|
435 |
-
|
436 |
try:
|
437 |
-
|
438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
return split_docs
|
|
|
440 |
except Exception as e:
|
441 |
logger.error(f"โ ๋ฌธ์ ๋ถํ ์คํจ: {e}")
|
442 |
-
|
443 |
|
444 |
def process_document(self, file_path: str) -> List[Document]:
|
445 |
"""
|
@@ -457,8 +263,12 @@ class DocumentProcessor:
|
|
457 |
file_type = self.get_file_type(file_path)
|
458 |
|
459 |
if file_type == 'pdf':
|
460 |
-
# PDF๋
|
461 |
-
|
|
|
|
|
|
|
|
|
462 |
else:
|
463 |
# ๋ค๋ฅธ ๋ฌธ์ ํ์์ ํ
์คํธ ๊ธฐ๋ฐ ์ฒ๋ฆฌ
|
464 |
documents = self.load_document(file_path)
|
@@ -488,156 +298,569 @@ class DocumentProcessor:
|
|
488 |
except Exception as e:
|
489 |
logger.error(f"โ ๋ฌธ์ ์ฒ๋ฆฌ ์คํจ: {e}")
|
490 |
return []
|
491 |
-
|
492 |
def _process_pdf_hybrid(self, pdf_path: str) -> List[Document]:
|
493 |
"""
|
494 |
-
PDF
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
List[Document]: ํ์ด๋ธ๋ฆฌ๋ ์ฒ๋ฆฌ๋ ๋ฌธ์ ์ฒญํฌ๋ค
|
502 |
"""
|
503 |
try:
|
504 |
import fitz # PyMuPDF
|
505 |
|
506 |
doc = fitz.open(pdf_path)
|
507 |
-
|
|
|
|
|
|
|
508 |
|
509 |
-
logger.info(f"
|
510 |
|
511 |
for page_num in range(len(doc)):
|
512 |
-
page = doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
513 |
|
514 |
-
#
|
515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
516 |
|
517 |
-
|
518 |
-
image_list = page.get_images()
|
519 |
-
page_images = []
|
520 |
|
521 |
-
|
522 |
-
|
523 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
524 |
pix = fitz.Pixmap(doc, xref)
|
525 |
-
|
526 |
-
|
|
|
|
|
|
|
|
|
|
|
527 |
pix = fitz.Pixmap(fitz.csRGB, pix)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
528 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
529 |
img_data = pix.tobytes("png")
|
530 |
-
img_pil = Image.open(io.BytesIO(img_data))
|
531 |
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
image_url = f"data:image/png;base64,{img_base64}"
|
538 |
-
|
539 |
-
page_images.append({
|
540 |
-
"index": img_index,
|
541 |
-
"image_url": image_url,
|
542 |
-
"size": img_pil.size
|
543 |
-
})
|
544 |
|
545 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
546 |
|
547 |
-
|
548 |
-
|
549 |
-
continue
|
550 |
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
558 |
|
559 |
-
|
560 |
-
|
|
|
561 |
|
562 |
-
|
|
|
|
|
|
|
563 |
|
564 |
-
#
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
|
|
|
|
|
|
|
|
|
573 |
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
578 |
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
|
|
584 |
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
logger.error(f"โ PDF ํ์ด๋ธ๋ฆฌ๋ ์ฒ๋ฆฌ ์คํจ: {e}")
|
605 |
-
# ์คํจ ์ ๊ธฐ์กด ํ
์คํธ ๊ธฐ๋ฐ ์ฒ๋ฆฌ๋ก ํด๋ฐฑ
|
606 |
-
logger.info("๐ ํ
์คํธ ๊ธฐ๋ฐ ์ฒ๋ฆฌ๋ก ํด๋ฐฑํฉ๋๋ค.")
|
607 |
-
return self.load_document(pdf_path)
|
608 |
-
|
609 |
def _is_valid_image(self, img: Image.Image) -> bool:
|
610 |
"""์ด๋ฏธ์ง ์ ํจ์ฑ ๊ฒ์ฌ"""
|
611 |
try:
|
|
|
|
|
612 |
# ์ต์/์ต๋ ํฌ๊ธฐ ํ์ธ
|
613 |
if img.size[0] < self.min_image_size[0] or img.size[1] < self.min_image_size[1]:
|
|
|
614 |
return False
|
615 |
if img.size[0] > self.max_image_size[0] or img.size[1] > self.max_image_size[1]:
|
|
|
616 |
return False
|
617 |
|
618 |
# ์ด๋ฏธ์ง ๋ชจ๋ ํ์ธ
|
619 |
if img.mode not in ['RGB', 'RGBA', 'L']:
|
|
|
620 |
return False
|
621 |
|
|
|
622 |
return True
|
623 |
-
except Exception:
|
|
|
624 |
return False
|
625 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
626 |
def _extract_formulas_from_documents(self, documents: List[Document]) -> List[Document]:
|
627 |
-
"""
|
628 |
-
|
629 |
-
"""
|
630 |
-
# ๊ธฐ์กด ๊ตฌํ ์ ์ง
|
631 |
return documents
|
632 |
|
633 |
def get_document_info(self, file_path: str) -> Dict[str, Any]:
|
634 |
-
"""๋ฌธ์ ์ ๋ณด
|
635 |
try:
|
636 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
637 |
|
|
|
638 |
total_text = ""
|
639 |
for doc in documents:
|
640 |
-
|
|
|
641 |
|
642 |
return {
|
643 |
'file_path': file_path,
|
@@ -659,4 +882,4 @@ class DocumentProcessor:
|
|
659 |
document_processor = DocumentProcessor(formula_ocr_engine='latexocr')
|
660 |
# ํ์์ ๋ค๋ฅธ ์์ง์ผ๋ก ๋ณ๊ฒฝ ๊ฐ๋ฅ:
|
661 |
# document_processor = DocumentProcessor(formula_ocr_engine='easyocr') # EasyOCR ์ฌ์ฉ
|
662 |
-
# document_processor = DocumentProcessor(formula_ocr_engine='mathpix') # MathPix API ์ฌ์ฉ
|
|
|
2 |
"""
|
3 |
๋ฌธ์ ์ฒ๋ฆฌ ๋ชจ๋
|
4 |
PDF, DOCX, PPTX ๋ฑ ๋ค์ํ ๋ฌธ์ ํ์์ ์ฒ๋ฆฌ
|
5 |
+
์ค๋ฌด ์์ค PDF ๊ตฌ์กฐ ๋ถ์ + ๊ณต๊ฐ์ ๊ด๊ณ ๋งคํ ์ง์
|
6 |
"""
|
7 |
|
8 |
import os
|
9 |
import logging
|
10 |
+
from typing import List, Dict, Any, Optional, Tuple, NamedTuple
|
11 |
from pathlib import Path
|
12 |
import easyocr
|
13 |
import re
|
14 |
import base64
|
15 |
import io
|
16 |
+
import json
|
17 |
+
from dataclasses import dataclass, field
|
18 |
|
19 |
# LangChain ๋ฌธ์ ๋ก๋๋ค
|
20 |
try:
|
21 |
from langchain_community.document_loaders import (
|
22 |
PyMuPDFLoader,
|
23 |
UnstructuredWordDocumentLoader,
|
24 |
+
UnstructuredPowerPointLoader
|
|
|
25 |
)
|
26 |
+
from langchain.schema import Document
|
27 |
+
LANGCHAIN_AVAILABLE = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
except ImportError:
|
29 |
+
LANGCHAIN_AVAILABLE = False
|
30 |
+
Document = None
|
31 |
|
32 |
+
# PIL (Pillow) ์ด๋ฏธ์ง ์ฒ๋ฆฌ
|
33 |
try:
|
34 |
from PIL import Image, ImageEnhance
|
35 |
PIL_AVAILABLE = True
|
|
|
40 |
|
41 |
logger = logging.getLogger(__name__)
|
42 |
|
43 |
+
# ์ค๋ฌด ์์ค PDF ๊ตฌ์กฐ ๋ถ์์ ์ํ ๋ฐ์ดํฐ ํด๋์ค๋ค
|
44 |
+
@dataclass
|
45 |
+
class BoundingBox:
|
46 |
+
"""๋ฐ์ด๋ฉ ๋ฐ์ค (x0, y0, x1, y1)"""
|
47 |
+
x0: float
|
48 |
+
y0: float
|
49 |
+
x1: float
|
50 |
+
y1: float
|
51 |
+
|
52 |
+
@property
|
53 |
+
def width(self) -> float:
|
54 |
+
return self.x1 - self.x0
|
55 |
+
|
56 |
+
@property
|
57 |
+
def height(self) -> float:
|
58 |
+
return self.y1 - self.y0
|
59 |
+
|
60 |
+
@property
|
61 |
+
def center(self) -> Tuple[float, float]:
|
62 |
+
return ((self.x0 + self.x1) / 2, (self.y0 + self.y1) / 2)
|
63 |
+
|
64 |
+
def overlaps_with(self, other: 'BoundingBox') -> bool:
|
65 |
+
"""๋ค๋ฅธ ๋ฐ์ค์ ๊ฒน์น๋์ง ํ์ธ"""
|
66 |
+
return not (self.x1 < other.x0 or other.x1 < self.x0 or
|
67 |
+
self.y1 < other.y0 or other.y1 < self.y0)
|
68 |
+
|
69 |
+
def distance_to(self, other: 'BoundingBox') -> float:
|
70 |
+
"""๋ค๋ฅธ ๋ฐ์ค์์ ๊ฑฐ๋ฆฌ (์ค์ฌ์ ๊ธฐ์ค)"""
|
71 |
+
cx1, cy1 = self.center
|
72 |
+
cx2, cy2 = other.center
|
73 |
+
return ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
|
74 |
+
|
75 |
+
@dataclass
|
76 |
+
class PDFBlock:
|
77 |
+
"""PDF์ ํ
์คํธ/์ด๋ฏธ์ง ๋ธ๋ก"""
|
78 |
+
block_id: str
|
79 |
+
block_type: str # 'text', 'image', 'table', 'figure'
|
80 |
+
bbox: BoundingBox
|
81 |
+
content: Any
|
82 |
+
page_num: int
|
83 |
+
metadata: Dict[str, Any] = field(default_factory=dict)
|
84 |
+
|
85 |
+
def is_near(self, other: 'PDFBlock', threshold: float = 50.0) -> bool:
|
86 |
+
"""๋ค๋ฅธ ๋ธ๋ก๊ณผ ๊ฐ๊น์ด์ง ํ์ธ (ํฝ์
๋จ์)"""
|
87 |
+
return self.bbox.distance_to(other.bbox) <= threshold
|
88 |
+
|
89 |
+
def is_above(self, other: 'PDFBlock', threshold: float = 10.0) -> bool:
|
90 |
+
"""๋ค๋ฅธ ๋ธ๋ก ์์ ์๋์ง ํ์ธ"""
|
91 |
+
return self.bbox.y1 <= other.bbox.y0 + threshold
|
92 |
+
|
93 |
+
def is_below(self, other: 'PDFBlock', threshold: float = 10.0) -> bool:
|
94 |
+
"""๋ค๋ฅธ ๋ธ๋ก ์๋์ ์๋์ง ํ์ธ"""
|
95 |
+
return self.bbox.y0 >= other.bbox.y1 - threshold
|
96 |
+
|
97 |
+
def is_left_of(self, other: 'PDFBlock', threshold: float = 10.0) -> bool:
|
98 |
+
"""๋ค๋ฅธ ๋ธ๋ก ์ผ์ชฝ์ ์๋์ง ํ์ธ"""
|
99 |
+
return self.bbox.x1 <= other.bbox.x0 + threshold
|
100 |
+
|
101 |
+
def is_right_of(self, other: 'PDFBlock', threshold: float = 10.0) -> bool:
|
102 |
+
"""๋ค๋ฅธ ๋ธ๋ก ์ค๋ฅธ์ชฝ์ ์๋์ง ํ์ธ"""
|
103 |
+
return self.bbox.x0 >= other.bbox.x1 - threshold
|
104 |
+
|
105 |
+
@dataclass
|
106 |
+
class PDFPage:
|
107 |
+
"""PDF ํ์ด์ง ๊ตฌ์กฐ"""
|
108 |
+
page_num: int
|
109 |
+
width: float
|
110 |
+
height: float
|
111 |
+
blocks: List[PDFBlock] = field(default_factory=list)
|
112 |
+
|
113 |
+
def get_blocks_by_type(self, block_type: str) -> List[PDFBlock]:
|
114 |
+
"""ํน์ ํ์
์ ๋ธ๋ก๋ค ๋ฐํ"""
|
115 |
+
return [block for block in self.blocks if block.block_type == block_type]
|
116 |
+
|
117 |
+
def find_related_blocks(self, target_block: PDFBlock,
|
118 |
+
relation_types: List[str] = None) -> List[Tuple[PDFBlock, str]]:
|
119 |
+
"""๊ด๋ จ๋ ๋ธ๋ก๋ค๊ณผ ๊ด๊ณ ํ์
๋ฐํ"""
|
120 |
+
if relation_types is None:
|
121 |
+
relation_types = ['near', 'above', 'below', 'left', 'right']
|
122 |
+
|
123 |
+
related = []
|
124 |
+
for block in self.blocks:
|
125 |
+
if block.block_id == target_block.block_id:
|
126 |
+
continue
|
127 |
+
|
128 |
+
for relation in relation_types:
|
129 |
+
if relation == 'near' and target_block.is_near(block):
|
130 |
+
related.append((block, 'near'))
|
131 |
+
elif relation == 'above' and target_block.is_above(block):
|
132 |
+
related.append((block, 'above'))
|
133 |
+
elif relation == 'below' and target_block.is_below(block):
|
134 |
+
related.append((block, 'below'))
|
135 |
+
elif relation == 'left' and target_block.is_left_of(block):
|
136 |
+
related.append((block, 'left'))
|
137 |
+
elif relation == 'right' and target_block.is_right_of(block):
|
138 |
+
related.append((block, 'right'))
|
139 |
+
|
140 |
+
return related
|
141 |
+
|
142 |
+
@dataclass
|
143 |
+
class PDFStructure:
|
144 |
+
"""PDF ์ ์ฒด ๊ตฌ์กฐ"""
|
145 |
+
pages: List[PDFPage] = field(default_factory=list)
|
146 |
+
metadata: Dict[str, Any] = field(default_factory=dict)
|
147 |
+
|
148 |
+
def get_all_blocks(self) -> List[PDFBlock]:
|
149 |
+
"""๋ชจ๋ ๋ธ๋ก ๋ฐํ"""
|
150 |
+
all_blocks = []
|
151 |
+
for page in self.pages:
|
152 |
+
all_blocks.extend(page.blocks)
|
153 |
+
return all_blocks
|
154 |
+
|
155 |
+
def get_blocks_by_type(self, block_type: str) -> List[PDFBlock]:
|
156 |
+
"""์ ์ฒด ๋ฌธ์์์ ํน์ ํ์
์ ๋ธ๋ก๋ค ๋ฐํ"""
|
157 |
+
blocks = []
|
158 |
+
for page in self.pages:
|
159 |
+
blocks.extend(page.get_blocks_by_type(block_type))
|
160 |
+
return blocks
|
161 |
+
|
162 |
class DocumentProcessor:
|
163 |
"""๋ฌธ์ ์ฒ๋ฆฌ ํด๋์ค"""
|
164 |
|
165 |
def __init__(self, formula_ocr_engine: str = 'easyocr'):
|
166 |
"""
|
167 |
+
๋ฌธ์ ์ฒ๋ฆฌ๊ธฐ ์ด๊ธฐํ
|
168 |
+
|
169 |
Args:
|
170 |
formula_ocr_engine: ์์ ์ถ์ถ ์์ง ('easyocr', 'mathpix', 'latexocr')
|
171 |
"""
|
172 |
self.formula_ocr_engine = formula_ocr_engine
|
173 |
+
self.formula_extractor_available = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
|
175 |
+
# ์ด๋ฏธ์ง ํฌ๊ธฐ ์ ํ ์ค์
|
176 |
+
self.min_image_size = (10, 10) # ์ต์ ์ด๋ฏธ์ง ํฌ๊ธฐ (๋๋ฌด ์์ ์ด๋ฏธ์ง ์ ์ธ)
|
177 |
+
self.max_image_size = (10000, 10000) # ์ต๋ ์ด๋ฏธ์ง ํฌ๊ธฐ (๋๋ฌด ํฐ ์ด๋ฏธ์ง ์ ์ธ)
|
|
|
|
|
|
|
|
|
178 |
|
179 |
+
# OCR ์์ง ์ด๊ธฐํ
|
180 |
+
try:
|
181 |
+
self.ocr_reader = easyocr.Reader(['ko', 'en'], gpu=False)
|
182 |
+
logger.info("โ
EasyOCR ์ด๊ธฐํ ์๋ฃ")
|
183 |
+
except Exception as e:
|
184 |
+
logger.warning(f"โ ๏ธ EasyOCR ์ด๊ธฐํ ์คํจ: {e}")
|
185 |
self.ocr_reader = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
# ์์ ์ถ์ถ ์์ง ์ค์ (LaTeX-OCR ๋นํ์ฑํ๋จ)
|
188 |
if formula_ocr_engine in ['mathpix']: # 'latexocr' ์ ๊ฑฐ
|
|
|
195 |
logger.warning(f"โ ๏ธ ์์ ์ถ์ถ ์์ง {formula_ocr_engine} ์ฌ์ฉ ๋ถ๊ฐ, EasyOCR๋ก ๋์ฒด")
|
196 |
else:
|
197 |
self.formula_extractor_available = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
|
199 |
+
logger.info(f"๐ DocumentProcessor ์ด๊ธฐํ ์๋ฃ (OCR: {'EasyOCR' if self.ocr_reader else 'None'}, ์์: {formula_ocr_engine})")
|
|
|
|
|
|
|
|
200 |
|
201 |
+
def get_file_type(self, file_path: str) -> str:
|
202 |
+
"""ํ์ผ ํ์ฅ์ ๊ธฐ๋ฐ ํ์ผ ํ์
๋ฐํ"""
|
203 |
+
ext = Path(file_path).suffix.lower()
|
204 |
+
# ํ์ฅ์์์ ์ ์ ๊ฑฐ
|
205 |
+
return ext[1:] if ext.startswith('.') else ext
|
|
|
|
|
|
|
|
|
|
|
206 |
|
207 |
def load_document(self, file_path: str) -> List[Document]:
|
208 |
+
"""๋ฌธ์ ๋ก๋ (๊ธฐ๋ณธ ๋ฐฉ์)"""
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
try:
|
210 |
+
file_type = self.get_file_type(file_path)
|
211 |
+
|
212 |
if file_type == 'pdf':
|
213 |
+
loader = PyMuPDFLoader(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
elif file_type == 'docx':
|
215 |
loader = UnstructuredWordDocumentLoader(file_path)
|
|
|
|
|
|
|
216 |
elif file_type == 'pptx':
|
217 |
loader = UnstructuredPowerPointLoader(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
else:
|
219 |
+
logger.warning(f"โ ๏ธ ์ง์ํ์ง ์๋ ํ์ผ ํ์: {file_type}")
|
220 |
+
return []
|
221 |
+
|
222 |
+
documents = loader.load()
|
223 |
+
logger.info(f"๐ ๋ฌธ์ ๋ก๋ ์๋ฃ: {len(documents)}๊ฐ ์ฒญํฌ")
|
224 |
+
return documents
|
225 |
+
|
226 |
except Exception as e:
|
227 |
logger.error(f"โ ๋ฌธ์ ๋ก๋ ์คํจ: {e}")
|
228 |
+
return []
|
|
|
|
|
229 |
|
230 |
+
def split_documents(self, documents: List[Document], chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
|
231 |
"""๋ฌธ์๋ฅผ ์ฒญํฌ๋ก ๋ถํ """
|
|
|
|
|
232 |
try:
|
233 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
234 |
+
|
235 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
236 |
+
chunk_size=chunk_size,
|
237 |
+
chunk_overlap=chunk_overlap,
|
238 |
+
length_function=len,
|
239 |
+
separators=["\n\n", "\n", " ", ""]
|
240 |
+
)
|
241 |
+
|
242 |
+
split_docs = text_splitter.split_documents(documents)
|
243 |
+
logger.info(f"๐ ๋ฌธ์ ๋ถํ ์๋ฃ: {len(documents)}๊ฐ โ {len(split_docs)}๊ฐ ์ฒญํฌ")
|
244 |
return split_docs
|
245 |
+
|
246 |
except Exception as e:
|
247 |
logger.error(f"โ ๋ฌธ์ ๋ถํ ์คํจ: {e}")
|
248 |
+
return documents
|
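A minimal usage sketch of the simplified loader and splitter above. The module-level instance at the bottom of this file is constructed with formula_ocr_engine='latexocr'; the engine name here is just an example:

processor = DocumentProcessor(formula_ocr_engine='easyocr')
docs = processor.load_document("sample.pdf")       # PyMuPDFLoader-based load
chunks = processor.split_documents(docs, chunk_size=1000, chunk_overlap=200)
print(len(docs), "pages ->", len(chunks), "chunks")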
249 |
|
250 |
def process_document(self, file_path: str) -> List[Document]:
|
251 |
"""
|
|
|
263 |
file_type = self.get_file_type(file_path)
|
264 |
|
265 |
if file_type == 'pdf':
|
266 |
+
# PDF๋ ์ค๋ฌด ์์ค ๊ตฌ์กฐ ๋ถ์์ผ๋ก ์ฒ๋ฆฌ
|
267 |
+
pdf_structure = self._process_pdf_with_structure_analysis(file_path)
|
268 |
+
# PDFStructure๋ฅผ Document ๊ฐ์ฒด๋ก ๋ณํ
|
269 |
+
documents = self._convert_structure_to_documents(pdf_structure)
|
270 |
+
logger.info(f"โ
์ค๋ฌด ์๏ฟฝ๏ฟฝ๏ฟฝ PDF ์ฒ๋ฆฌ ์๋ฃ: {len(documents)}๊ฐ Document ์์ฑ")
|
271 |
+
return documents
|
272 |
else:
|
273 |
# ๋ค๋ฅธ ๋ฌธ์ ํ์์ ํ
์คํธ ๊ธฐ๋ฐ ์ฒ๋ฆฌ
|
274 |
documents = self.load_document(file_path)
|
|
|
298 |
except Exception as e:
|
299 |
logger.error(f"โ ๋ฌธ์ ์ฒ๋ฆฌ ์คํจ: {e}")
|
300 |
return []
|
301 |
+
|
302 |
def _process_pdf_hybrid(self, pdf_path: str) -> List[Document]:
|
303 |
"""
|
304 |
+
์ค๋ฌด ์์ค PDF ์ฒ๋ฆฌ (๊ตฌ์กฐ ๋ถ์ + ๊ณต๊ฐ์ ๊ด๊ณ ๋งคํ)
|
305 |
+
"""
|
306 |
+
return self._process_pdf_with_structure_analysis(pdf_path)
|
307 |
+
|
308 |
+
def _process_pdf_with_structure_analysis(self, pdf_path: str) -> List[Document]:
|
309 |
+
"""
|
310 |
+
์ค๋ฌด ์์ค PDF ์ฒ๋ฆฌ (๊ตฌ์กฐ ๋ถ์ + ๊ณต๊ฐ์ ๊ด๊ณ ๋งคํ)
|
|
|
311 |
"""
|
312 |
try:
|
313 |
import fitz # PyMuPDF
|
314 |
|
315 |
doc = fitz.open(pdf_path)
|
316 |
+
pdf_structure = PDFStructure()
|
317 |
+
|
318 |
+
# PDF ๊ฒฝ๋ก ์ ๋ณด๋ฅผ ๋ฉํ๋ฐ์ดํฐ์ ์ถ๊ฐ
|
319 |
+
pdf_structure.metadata["source"] = pdf_path
|
320 |
|
321 |
+
logger.info(f"๐ PDF ๊ตฌ์กฐ ๋ถ์ ์์: {len(doc)}ํ์ด์ง")
|
322 |
|
323 |
for page_num in range(len(doc)):
|
324 |
+
page = doc[page_num]
|
325 |
+
page_rect = page.rect
|
326 |
+
|
327 |
+
pdf_page = PDFPage(
|
328 |
+
page_num=page_num + 1,
|
329 |
+
width=page_rect.width,
|
330 |
+
height=page_rect.height
|
331 |
+
)
|
332 |
+
|
333 |
+
# 1. ํ
์คํธ ๋ธ๋ก ์ถ์ถ (์์น ์ ๋ณด ํฌํจ)
|
334 |
+
text_blocks = self._extract_text_blocks(page, page_num)
|
335 |
+
pdf_page.blocks.extend(text_blocks)
|
336 |
+
|
337 |
+
# 2. ์ด๋ฏธ์ง ๋ธ๋ก ์ถ์ถ (์์น ์ ๋ณด ํฌํจ)
|
338 |
+
image_blocks = self._extract_image_blocks(page, page_num, doc)
|
339 |
+
pdf_page.blocks.extend(image_blocks)
|
340 |
+
|
341 |
+
pdf_structure.pages.append(pdf_page)
|
342 |
+
|
343 |
+
logger.info(f"๐ ํ์ด์ง {page_num + 1} ๋ถ์ ์๋ฃ: "
|
344 |
+
f"ํ
์คํธ ๋ธ๋ก {len(text_blocks)}๊ฐ, "
|
345 |
+
f"์ด๋ฏธ์ง ๋ธ๋ก {len(image_blocks)}๊ฐ")
|
346 |
+
|
347 |
+
doc.close()
|
348 |
+
|
349 |
+
# 3. ๋ธ๋ก ๊ฐ ๊ด๊ณ ๋ถ์
|
350 |
+
self._analyze_block_relationships(pdf_structure)
|
351 |
+
|
352 |
+
logger.info(f"โ
PDF ๊ตฌ์กฐ ๋ถ์ ์๋ฃ: ์ด {len(pdf_structure.get_all_blocks())}๊ฐ ๋ธ๋ก")
|
353 |
+
return pdf_structure
|
354 |
+
|
355 |
+
except Exception as e:
|
356 |
+
logger.error(f"โ PDF ๊ตฌ์กฐ ๋ถ์ ์คํจ: {e}")
|
357 |
+
import traceback
|
358 |
+
traceback.print_exc()
|
359 |
+
return PDFStructure()
|
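process_document converts the PDFStructure returned above into LangChain Documents via _convert_structure_to_documents, whose body is outside the visible part of this diff. A hypothetical sketch of what that conversion could look like; the metadata keys mirror what generate_sync in app_v2.py reads ('structured_analysis', 'spatial_relationships', 'image_blocks', 'multimodal_ready', 'image_data_list'), while the real page/text-block markers it checks are the Korean "=== 페이지" / "[텍스트 블록" strings, rendered here as English placeholders:

def _convert_structure_to_documents(self, pdf_structure: "PDFStructure") -> List[Document]:
    """Hypothetical sketch -- the real implementation is not shown in this diff."""
    documents = []
    for page in pdf_structure.pages:
        lines = [f"=== Page {page.page_num} ==="]
        image_blocks_meta = []
        image_data_list = []

        for block in page.blocks:
            if block.block_type == "text":
                lines.append(f"[Text block {block.block_id}]\n{block.content}")
            elif block.block_type == "image":
                related = page.find_related_blocks(block, ["near", "above", "below"])
                image_blocks_meta.append({
                    "block_id": block.block_id,
                    "bbox": {"x0": block.bbox.x0, "y0": block.bbox.y0,
                             "x1": block.bbox.x1, "y1": block.bbox.y1},
                    "related_text_count": len(related),
                })
                image_data_list.append(block.content)  # raw PNG bytes from _extract_image_blocks

        documents.append(Document(
            page_content="\n\n".join(lines),
            metadata={
                "source": pdf_structure.metadata.get("source"),
                "page": page.page_num,
                "structured_analysis": True,
                "spatial_relationships": True,
                "image_blocks": image_blocks_meta,
                "multimodal_ready": bool(image_data_list),
                "image_data_list": image_data_list,
            },
        ))
    return documents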
360 |
+
|
361 |
+
    def _extract_text_blocks(self, page, page_num: int) -> List[PDFBlock]:
        """Extract text blocks from a page."""
        text_blocks = []

        try:
            # Use PyMuPDF's get_text("dict") - it gives the most accurate position info
            text_dict = page.get_text("dict")

            for block_idx, block in enumerate(text_dict["blocks"]):
                if "lines" not in block:  # skip image blocks
                    continue

                # Bounding box of the block
                bbox = BoundingBox(
                    x0=block["bbox"][0],
                    y0=block["bbox"][1],
                    x1=block["bbox"][2],
                    y1=block["bbox"][3]
                )

                # Extract the text content
                text_content = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        text_content += span["text"]
                    text_content += "\n"

                text_content = text_content.strip()

                if text_content:  # skip empty text blocks
                    text_block = PDFBlock(
                        block_id=f"page_{page_num + 1}_text_{block_idx}",
                        block_type="text",
                        bbox=bbox,
                        content=text_content,
                        page_num=page_num + 1,
                        metadata={
                            "font_info": self._extract_font_info(block),
                            "word_count": len(text_content.split()),
                            "char_count": len(text_content)
                        }
                    )
                    text_blocks.append(text_block)

        except Exception as e:
            logger.warning(f"Page {page_num + 1} text block extraction failed: {e}")

        return text_blocks

    def _extract_image_blocks(self, page, page_num: int, doc=None) -> List[PDFBlock]:
        """Extract image blocks from a page."""
        image_blocks = []

        try:
            import fitz  # PyMuPDF

            # 1. Extract embedded images
            images = page.get_images()

            for img_idx, img_info in enumerate(images):
                try:
                    # Extract the image - use the doc object directly
                    xref = img_info[0]
                    if doc:
                        pix = fitz.Pixmap(doc, xref)
                    else:
                        # fallback: try to extract directly from the page
                        pix = page.get_pixmap()
                        continue  # skip this case

                    if pix.n - pix.alpha < 4:  # GRAY or RGB
                        if pix.colorspace and pix.colorspace.n > 3:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                    img_data = pix.tobytes("png")
                    img_pil = Image.open(io.BytesIO(img_data))

                    if self._is_valid_image(img_pil):
                        # Extract the image's position info (important!)
                        img_rect = self._get_image_rect(page, xref)

                        if img_rect:
                            bbox = BoundingBox(
                                x0=img_rect.x0,
                                y0=img_rect.y0,
                                x1=img_rect.x1,
                                y1=img_rect.y1
                            )

                            image_block = PDFBlock(
                                block_id=f"page_{page_num + 1}_image_{img_idx}",
                                block_type="image",
                                bbox=bbox,
                                content=img_data,  # binary data
                                page_num=page_num + 1,
                                metadata={
                                    "image_size": img_pil.size,
                                    "image_format": "PNG",
                                    "image_mode": img_pil.mode,
                                    "xref": xref,
                                    "is_embedded": True
                                }
                            )
                            image_blocks.append(image_block)

                            logger.debug(f"Image block extracted: page {page_num + 1}, "
                                         f"position ({bbox.x0:.1f}, {bbox.y0:.1f}, {bbox.x1:.1f}, {bbox.y1:.1f}), "
                                         f"size {img_pil.size}")

                    pix = None

                except Exception as e:
                    logger.warning(f"Image {img_idx} processing failed: {e}")

            # 2. If no images were found, render the full page (fallback)
            if not image_blocks:
                # Check whether the current model is multimodal
                try:
                    from lily_llm_api.app_v2 import current_profile
                    is_multimodal = hasattr(current_profile, 'multimodal') and current_profile.multimodal

                    if is_multimodal:
                        # Render the whole page as an image
                        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
                        img_data = pix.tobytes("png")

                        bbox = BoundingBox(
                            x0=0, y0=0,
                            x1=page.rect.width,
                            y1=page.rect.height
                        )

                        image_block = PDFBlock(
                            block_id=f"page_{page_num + 1}_fullpage",
                            block_type="image",
                            bbox=bbox,
                            content=img_data,
                            page_num=page_num + 1,
                            metadata={
                                "image_size": (pix.width, pix.height),
                                "image_format": "PNG",
                                "is_embedded": False,
                                "is_full_page_render": True
                            }
                        )
                        image_blocks.append(image_block)

                        logger.debug(f"Full page rendered: page {page_num + 1}")
                        pix = None

                except ImportError:
                    pass  # ignore if app_v2 cannot be imported

        except Exception as e:
            logger.warning(f"Page {page_num + 1} image block extraction failed: {e}")

        return image_blocks

    def _get_image_rect(self, page, xref: int) -> Optional[Any]:
        """Find the actual position (rect) of an image."""
        try:
            import fitz  # PyMuPDF

            # Look for the position among all image references on the page
            for item in page.get_images(full=True):
                if item[0] == xref:  # the xref matches
                    # Find where the image is actually placed
                    image_list = page.get_image_info()
                    for img_info in image_list:
                        if img_info["xref"] == xref:
                            return fitz.Rect(img_info["bbox"])

            # Alternative: search the page content for the image position
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if block.get("type") == 1:  # image block
                    if block.get("xref") == xref:
                        return fitz.Rect(block["bbox"])

            return None

        except Exception as e:
            logger.debug(f"Failed to find image position: {e}")
            return None

    def _extract_font_info(self, block: Dict) -> Dict[str, Any]:
        """Extract font information from a text block."""
        font_info = {
            "fonts": [],
            "sizes": [],
            "flags": []
        }

        try:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    font_info["fonts"].append(span.get("font", ""))
                    font_info["sizes"].append(span.get("size", 0))
                    font_info["flags"].append(span.get("flags", 0))

            # Remove duplicates
            font_info["fonts"] = list(set(font_info["fonts"]))
            font_info["sizes"] = list(set(font_info["sizes"]))
            font_info["flags"] = list(set(font_info["flags"]))

        except Exception as e:
            logger.debug(f"Font info extraction failed: {e}")

        return font_info

    def _analyze_block_relationships(self, pdf_structure: PDFStructure):
        """Analyze relationships between blocks and store them in metadata."""

        for page in pdf_structure.pages:
            text_blocks = page.get_blocks_by_type("text")
            image_blocks = page.get_blocks_by_type("image")

            # For each image block, find the related text blocks
            for image_block in image_blocks:
                related_texts = []

                for text_block in text_blocks:
                    # Spatial relationship analysis
                    if image_block.is_near(text_block, threshold=100):  # within 100 pixels
                        relation = self._determine_spatial_relationship(image_block, text_block)
                        related_texts.append({
                            "block_id": text_block.block_id,
                            "relationship": relation,
                            "distance": image_block.bbox.distance_to(text_block.bbox),
                            "content_preview": text_block.content[:100] + "..." if len(text_block.content) > 100 else text_block.content
                        })

                # Sort by distance
                related_texts.sort(key=lambda x: x["distance"])
                image_block.metadata["related_texts"] = related_texts[:3]  # keep only the 3 closest

                logger.debug(f"Image block {image_block.block_id} linked to "
                             f"{len(related_texts)} text blocks")

    def _determine_spatial_relationship(self, block1: PDFBlock, block2: PDFBlock) -> str:
        """Determine the spatial relationship between two blocks."""
        if block1.is_above(block2):
            return "above"
        elif block1.is_below(block2):
            return "below"
        elif block1.is_left_of(block2):
            return "left"
        elif block1.is_right_of(block2):
            return "right"
        else:
            return "near"

    def _convert_structure_to_documents(self, pdf_structure: PDFStructure) -> List[Document]:
        """
        Convert a PDF structure into Document objects (practical grade - includes spatial relationships).
        """
        documents = []

        try:
            for page in pdf_structure.pages:
                text_blocks = page.get_blocks_by_type("text")
                image_blocks = page.get_blocks_by_type("image")

                if not text_blocks and not image_blocks:
                    continue  # skip empty pages

                # Build the structured content for the page
                page_content = self._build_structured_content(page, text_blocks, image_blocks)

                # Prepare the image data (for multimodal processing)
                image_data_list = []
                image_metadata = []

                for img_block in image_blocks:
                    image_data_list.append(img_block.content)  # binary data
                    image_metadata.append({
                        "block_id": img_block.block_id,
                        "bbox": {
                            "x0": img_block.bbox.x0, "y0": img_block.bbox.y0,
                            "x1": img_block.bbox.x1, "y1": img_block.bbox.y1
                        },
                        "size": img_block.metadata.get("image_size"),
                        "related_texts": img_block.metadata.get("related_texts", [])
                    })

                # Build the Document metadata
                metadata = {
                    "source": pdf_structure.metadata.get("source", "unknown"),
                    "page": page.page_num,
                    "total_pages": len(pdf_structure.pages),
                    "has_text": len(text_blocks) > 0,
                    "has_images": len(image_blocks) > 0,
                    "text_block_count": len(text_blocks),
                    "image_block_count": len(image_blocks),
                    "page_width": page.width,
                    "page_height": page.height,

                    # Practical-grade metadata
                    "structured_analysis": True,
                    "spatial_relationships": True,

                    # Data for multimodal processing
                    "multimodal_ready": len(image_blocks) > 0,
                    "image_data_list": image_data_list,
                    "image_metadata": image_metadata,

                    # Block structure info
                    "text_blocks": [
                        {
                            "block_id": tb.block_id,
                            "bbox": {"x0": tb.bbox.x0, "y0": tb.bbox.y0, "x1": tb.bbox.x1, "y1": tb.bbox.y1},
                            "word_count": tb.metadata.get("word_count", 0),
                            "content_preview": tb.content[:50] + "..." if len(tb.content) > 50 else tb.content
                        }
                        for tb in text_blocks
                    ],
                    "image_blocks": [
                        {
                            "block_id": ib.block_id,
                            "bbox": {"x0": ib.bbox.x0, "y0": ib.bbox.y0, "x1": ib.bbox.x1, "y1": ib.bbox.y1},
                            "size": ib.metadata.get("image_size"),
                            "related_text_count": len(ib.metadata.get("related_texts", []))
                        }
                        for ib in image_blocks
                    ]
                }

                # Create the Document object
                doc = Document(
                    page_content=page_content,
                    metadata=metadata
                )
                documents.append(doc)

                logger.info(f"Page {page.page_num} Document created: "
                            f"{len(text_blocks)} text blocks, {len(image_blocks)} image blocks")

        except Exception as e:
            logger.error(f"PDF structure -> Document conversion failed: {e}")
            import traceback
            traceback.print_exc()

        return documents

    def _build_structured_content(self, page: PDFPage, text_blocks: List[PDFBlock],
                                  image_blocks: List[PDFBlock]) -> str:
        """
        Build the structured content of a page (based on spatial relationships).
        """
        content_parts = []

        # Page header
        content_parts.append(f"=== Page {page.page_num} ===")

        # Sort text blocks by Y coordinate (top to bottom)
        sorted_text_blocks = sorted(text_blocks, key=lambda b: b.bbox.y0)

        # Compose the content taking image-text relationships into account
        processed_images = set()

        for text_block in sorted_text_blocks:
            # 1. Add the text block
            content_parts.append(f"\n[Text block {text_block.block_id}]")
            content_parts.append(text_block.content)

            # 2. Find images related to this text block
            related_images = []
            for img_block in image_blocks:
                if img_block.block_id in processed_images:
                    continue

                # Find spatially close images
                if text_block.is_near(img_block, threshold=150):
                    related_images.append(img_block)

            # 3. Add the related images sorted by distance
            if related_images:
                related_images.sort(key=lambda img: text_block.bbox.distance_to(img.bbox))

                for img_block in related_images[:2]:  # at most 2 images
                    relationship = self._determine_spatial_relationship(text_block, img_block)
                    content_parts.append(f"\n[Image {img_block.block_id} - {relationship} relationship]")
                    content_parts.append(f"Image size: {img_block.metadata.get('image_size', 'unknown')}")

                    # Add related text info
                    related_texts = img_block.metadata.get("related_texts", [])
                    if related_texts:
                        content_parts.append("Related text:")
                        for rt in related_texts[:2]:  # at most 2
                            content_parts.append(f"  - {rt['relationship']}: {rt['content_preview']}")

                    processed_images.add(img_block.block_id)

        # 4. Add images that were not processed (standalone images)
        unprocessed_images = [img for img in image_blocks if img.block_id not in processed_images]
        if unprocessed_images:
            content_parts.append("\n[Standalone images]")
            for img_block in unprocessed_images:
                content_parts.append(f"\n[Image {img_block.block_id}]")
                content_parts.append(f"Image size: {img_block.metadata.get('image_size', 'unknown')}")
                content_parts.append(f"Position: ({img_block.bbox.x0:.1f}, {img_block.bbox.y0:.1f})")

        # 5. Add summary info
        content_parts.append(f"\n[Page {page.page_num} summary]")
        content_parts.append(f"Text blocks: {len(text_blocks)}")
        content_parts.append(f"Image blocks: {len(image_blocks)}")
        content_parts.append(f"Page size: {page.width:.1f} x {page.height:.1f}")

        return "\n".join(content_parts)

    def _is_valid_image(self, img: Image.Image) -> bool:
        """Validate an image."""
        try:
            logger.debug(f"Image validity check: size={img.size}, mode={img.mode}")

            # Check minimum/maximum size
            if img.size[0] < self.min_image_size[0] or img.size[1] < self.min_image_size[1]:
                logger.debug(f"Image too small: {img.size} < {self.min_image_size}")
                return False
            if img.size[0] > self.max_image_size[0] or img.size[1] > self.max_image_size[1]:
                logger.debug(f"Image too large: {img.size} > {self.max_image_size}")
                return False

            # Check the image mode
            if img.mode not in ['RGB', 'RGBA', 'L']:
                logger.debug(f"Unsupported image mode: {img.mode}")
                return False

            logger.debug(f"Image validity check passed: {img.size}, {img.mode}")
            return True
        except Exception as e:
            logger.debug(f"Image validity check failed: {e}")
            return False

    def _extract_text_from_image(self, img: Image.Image) -> str:
        """Extract text from an image (OCR)."""
        try:
            # Try EasyOCR first
            if self.ocr_reader:
                try:
                    result = self.ocr_reader.readtext(img)
                    if result:
                        text = " ".join([item[1] for item in result])
                        logger.info(f"Text extracted with EasyOCR: {len(text)} characters")
                        return text
                except Exception as e:
                    logger.warning(f"EasyOCR failed: {e}")

            # Tesseract fallback
            try:
                import pytesseract
                # Convert to grayscale (works best with Tesseract)
                if img.mode != 'L':
                    img_gray = img.convert('L')
                else:
                    img_gray = img

                text = pytesseract.image_to_string(img_gray, lang='kor+eng')
                if text and text.strip():
                    logger.info(f"Text extracted with Tesseract: {len(text.strip())} characters")
                    return text.strip()
                else:
                    logger.info("Tesseract OCR returned no result")
                    return ""

            except ImportError:
                logger.warning("pytesseract is not installed")
                return ""
            except Exception as e:
                logger.warning(f"Tesseract OCR failed: {e}")
                return ""

        except Exception as e:
            logger.error(f"Image OCR failed entirely: {e}")
            return ""

    def _extract_formulas_from_documents(self, documents: List[Document]) -> List[Document]:
        """Extract formulas from documents (currently disabled)."""
        # Formula extraction is currently disabled
        return documents

    def get_document_info(self, file_path: str) -> Dict[str, Any]:
        """Get document information."""
        try:
            if not os.path.exists(file_path):
                return {'supported': False, 'error': 'File does not exist.'}

            file_type = self.get_file_type(file_path)

            if file_type not in ['pdf', 'docx', 'pptx']:
                return {'supported': False, 'error': f'Unsupported file type: {file_type}'}

            # Try to load the document
            documents = self.process_document(file_path)

            if not documents:
                return {'supported': False, 'error': 'Could not extract document content.'}

            # Collect statistics
            total_text = ""
            for doc in documents:
                if hasattr(doc, 'page_content'):
                    total_text += doc.page_content + " "

            return {
                'file_path': file_path,

document_processor = DocumentProcessor(formula_ocr_engine='latexocr')
# Another engine can be selected if needed:
# document_processor = DocumentProcessor(formula_ocr_engine='easyocr')   # use EasyOCR
# document_processor = DocumentProcessor(formula_ocr_engine='mathpix')   # use the MathPix API
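For orientation, a minimal usage sketch of the structured PDF path above (not part of this commit); the input file name is hypothetical and the import path assumes the module layout used in this repository.

import pprint
from lily_llm_core.document_processor import document_processor

# Run a PDF through the structure-analysis pipeline and inspect per-page metadata
docs = document_processor.process_document("sample.pdf")  # hypothetical input file
for doc in docs:
    meta = doc.metadata
    print(meta["page"], meta["text_block_count"], meta["image_block_count"], meta["multimodal_ready"])
    pprint.pprint(meta["image_blocks"])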
lily_llm_core/integrated_memory_manager.py
ADDED
@@ -0,0 +1,404 @@
#!/usr/bin/env python3
"""
Integrated Memory Manager
Central manager that ties the hierarchical memory system together.
"""

import logging
import time
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass

from .user_memory_manager import user_memory_manager, UserMemory
from .room_context_manager import room_context_manager, RoomContext
from .context_manager import AdvancedContextManager

logger = logging.getLogger(__name__)

@dataclass
class MemoryContext:
    """Combined memory context data."""
    # User info
    user_id: str
    user_memory: UserMemory

    # Chat room info
    room_id: str
    room_context: RoomContext

    # Session info
    session_id: str

    # Combined context
    combined_context: str = ""
    memory_summary: str = ""

class IntegratedMemoryManager:
    """Integrated manager for the hierarchical memory system."""

    def __init__(self):
        self.user_memory_manager = user_memory_manager
        self.room_context_manager = room_context_manager

        # Per-session context managers
        self.session_context_managers: Dict[str, AdvancedContextManager] = {}

        logger.info("IntegratedMemoryManager initialized")

    def get_memory_context(self, user_id: str, room_id: str, session_id: str) -> MemoryContext:
        """Fetch the combined memory context."""
        try:
            # 1. User memory
            user_memory = self.user_memory_manager.get_user_memory(user_id)

            # 2. Chat room context
            room_context = self.room_context_manager.get_room_context(room_id)

            # 3. Ensure a per-session context manager exists
            if session_id not in self.session_context_managers:
                self.session_context_managers[session_id] = AdvancedContextManager(
                    max_tokens=2000,
                    max_turns=20,
                    strategy="sliding_window"
                )

            # 4. Build the combined context
            combined_context = self._create_combined_context(
                user_memory, room_context, session_id
            )

            # 5. Build the memory summary
            memory_summary = self._create_memory_summary(
                user_memory, room_context
            )

            return MemoryContext(
                user_id=user_id,
                user_memory=user_memory,
                room_id=room_id,
                room_context=room_context,
                session_id=session_id,
                combined_context=combined_context,
                memory_summary=memory_summary
            )

        except Exception as e:
            logger.error(f"Failed to fetch memory context: {user_id}/{room_id}/{session_id} - {e}")
            # Fall back to defaults
            return self._create_default_memory_context(user_id, room_id, session_id)

    def add_document_to_room(self, room_id: str, document_info: Dict[str, Any]) -> bool:
        """Add a document to a chat room."""
        try:
            # Register the document with the room context
            success = self.room_context_manager.add_document(room_id, document_info)

            if success:
                # Update user statistics
                user_id = document_info.get("uploaded_by", "unknown")
                if user_id != "unknown":
                    self.user_memory_manager.record_conversation(
                        user_id,
                        topic=f"Document upload: {document_info['filename']}"
                    )

                logger.info(f"Document added: {room_id} - {document_info['filename']}")

            return success

        except Exception as e:
            logger.error(f"Failed to add document: {room_id} - {e}")
            return False

    def update_user_preferences(self, user_id: str, preferences: Dict[str, Any]) -> bool:
        """Update user preferences."""
        try:
            success = self.user_memory_manager.update_preferences(user_id, preferences)

            if success:
                logger.info(f"User preferences updated: {user_id} - {len(preferences)} items")

            return success

        except Exception as e:
            logger.error(f"Failed to update user preferences: {user_id} - {e}")
            return False

    def add_important_info(self, user_id: str, info: str) -> bool:
        """Add an important piece of user info."""
        try:
            success = self.user_memory_manager.add_important_info(user_id, info)

            if success:
                logger.info(f"Important info added: {user_id} - {info[:50]}...")

            return success

        except Exception as e:
            logger.error(f"Failed to add important info: {user_id} - {e}")
            return False

    def record_conversation(self, user_id: str, room_id: str, topic: str = None) -> bool:
        """Record a conversation (update statistics)."""
        try:
            # Update user statistics
            user_success = self.user_memory_manager.record_conversation(user_id, topic)

            # Update chat room statistics
            room_success = self.room_context_manager.increment_message_count(room_id)

            # Register the participant
            self.room_context_manager.add_participant(room_id, user_id)

            return user_success and room_success

        except Exception as e:
            logger.error(f"Failed to record conversation: {user_id}/{room_id} - {e}")
            return False

    def get_context_for_ai(self, user_id: str, room_id: str, session_id: str,
                           include_user_memory: bool = True,
                           include_room_context: bool = True,
                           include_session_history: bool = True) -> str:
        """Build the combined context used when generating an AI response."""
        try:
            context_parts = []

            # 1. User memory (global long-term memory)
            if include_user_memory:
                user_memory = self.user_memory_manager.get_user_memory(user_id)
                user_context = self._format_user_memory_for_ai(user_memory)
                if user_context:
                    context_parts.append(f"=== User info ===\n{user_context}")

            # 2. Chat room context (mid-term memory)
            if include_room_context:
                room_context = self.room_context_manager.get_room_context(room_id)
                room_context_str = self._format_room_context_for_ai(room_context)
                if room_context_str:
                    context_parts.append(f"=== Chat room info ===\n{room_context_str}")

            # 3. Session history (short-term memory)
            if include_session_history and session_id in self.session_context_managers:
                session_manager = self.session_context_managers[session_id]
                session_context = session_manager.get_context(include_system=True, max_length=1000)
                if session_context:
                    context_parts.append(f"=== Current conversation ===\n{session_context}")

            # Join into a single combined context
            combined_context = "\n\n".join(context_parts)

            logger.debug(f"Combined AI context built: {user_id}/{room_id}/{session_id} - {len(combined_context)} characters")
            return combined_context

        except Exception as e:
            logger.error(f"Failed to build AI context: {user_id}/{room_id}/{session_id} - {e}")
            return ""

    def _create_combined_context(self, user_memory: UserMemory,
                                 room_context: RoomContext,
                                 session_id: str) -> str:
        """Build the combined context."""
        try:
            context_parts = []

            # Basic user info
            if user_memory.name:
                context_parts.append(f"User: {user_memory.name}")

            # User preferences
            if user_memory.preferences:
                prefs = ", ".join([f"{k}: {v}" for k, v in user_memory.preferences.items()])
                context_parts.append(f"Preferences: {prefs}")

            # Chat room info
            if room_context.room_name:
                context_parts.append(f"Chat room: {room_context.room_name}")

            # Document info
            if room_context.documents:
                doc_names = []
                for d in room_context.documents[-3:]:
                    if isinstance(d, dict):
                        filename = d.get('filename', 'unknown')
                    else:
                        filename = getattr(d, 'filename', 'unknown')
                    doc_names.append(filename)

                doc_info = f"{len(room_context.documents)} documents: " + ", ".join(doc_names)
                context_parts.append(doc_info)

            return "\n".join(context_parts)

        except Exception as e:
            logger.error(f"Failed to build combined context: {e}")
            return ""

    def _create_memory_summary(self, user_memory: UserMemory,
                               room_context: RoomContext) -> str:
        """Build the memory summary."""
        try:
            summary_parts = []

            # User summary
            if user_memory.important_info:
                summary_parts.append(f"Important user info: {len(user_memory.important_info)} items")

            if user_memory.expertise_areas:
                summary_parts.append(f"Expertise: {', '.join(user_memory.expertise_areas)}")

            # Chat room summary
            if room_context.conversation_summary:
                summary_parts.append(f"Conversation summary: {room_context.conversation_summary[:100]}...")

            if room_context.key_topics:
                summary_parts.append(f"Key topics: {', '.join(room_context.key_topics)}")

            return " | ".join(summary_parts)

        except Exception as e:
            logger.error(f"Failed to build memory summary: {e}")
            return ""

    def _format_user_memory_for_ai(self, user_memory: UserMemory) -> str:
        """Format the user memory for the AI."""
        try:
            parts = []

            if user_memory.name:
                parts.append(f"Name: {user_memory.name}")

            if user_memory.important_info:
                parts.append(f"Important info: {', '.join(user_memory.important_info)}")

            if user_memory.expertise_areas:
                parts.append(f"Expertise: {', '.join(user_memory.expertise_areas)}")

            if user_memory.interests:
                parts.append(f"Interests: {', '.join(user_memory.interests)}")

            if user_memory.communication_style:
                parts.append(f"Communication style: {user_memory.communication_style}")

            if user_memory.ai_personality:
                parts.append(f"AI personality: {user_memory.ai_personality}")

            return "\n".join(parts)

        except Exception as e:
            logger.error(f"Failed to format user memory: {e}")
            return ""

    def _format_room_context_for_ai(self, room_context: RoomContext) -> str:
        """Format the chat room context for the AI."""
        try:
            parts = []

            if room_context.room_name:
                parts.append(f"Chat room: {room_context.room_name}")

            if room_context.description:
                parts.append(f"Description: {room_context.description}")

            if room_context.documents:
                parts.append(f"Uploaded documents: {len(room_context.documents)}")
                for doc in room_context.documents[-3:]:  # most recent 3 only
                    # Handle both dicts and objects
                    if isinstance(doc, dict):
                        filename = doc.get('filename', 'unknown')
                        doc_type = doc.get('document_type', 'unknown')
                        page_count = doc.get('page_count', 0)
                    else:
                        filename = getattr(doc, 'filename', 'unknown')
                        doc_type = getattr(doc, 'document_type', 'unknown')
                        page_count = getattr(doc, 'page_count', 0)

                    parts.append(f"  - {filename} ({doc_type}, {page_count} pages)")

            if room_context.conversation_summary:
                parts.append(f"Conversation summary: {room_context.conversation_summary}")

            if room_context.key_topics:
                parts.append(f"Key topics: {', '.join(room_context.key_topics)}")

            return "\n".join(parts)

        except Exception as e:
            logger.error(f"Failed to format chat room context: {e}")
            return ""

    def _create_default_memory_context(self, user_id: str, room_id: str, session_id: str) -> MemoryContext:
        """Build a default memory context (used on error)."""
        try:
            user_memory = self.user_memory_manager.get_user_memory(user_id)
            room_context = self.room_context_manager.get_room_context(room_id)

            return MemoryContext(
                user_id=user_id,
                user_memory=user_memory,
                room_id=room_id,
                room_context=room_context,
                session_id=session_id,
                combined_context="",
                memory_summary=""
            )

        except Exception as e:
            logger.error(f"Failed to build default memory context: {e}")
            # Return a minimal default
            return MemoryContext(
                user_id=user_id,
                user_memory=None,
                room_id=room_id,
                room_context=None,
                session_id=session_id,
                combined_context="",
                memory_summary=""
            )

    def get_session_context_manager(self, session_id: str) -> AdvancedContextManager:
        """Return the per-session context manager."""
        if session_id not in self.session_context_managers:
            self.session_context_managers[session_id] = AdvancedContextManager(
                max_tokens=2000,
                max_turns=20,
                strategy="sliding_window"
            )

        return self.session_context_managers[session_id]

    def cleanup_old_sessions(self, max_age_hours: int = 24):
        """Clean up stale sessions."""
        try:
            current_time = time.time()
            max_age_seconds = max_age_hours * 3600

            sessions_to_remove = []

            for session_id, manager in self.session_context_managers.items():
                # Try to extract the timestamp from the session ID
                try:
                    # session_id format: room_{room_id}_user_{user_id}_{timestamp}
                    if "_" in session_id:
                        timestamp_str = session_id.split("_")[-1]
                        timestamp = int(timestamp_str)

                        if current_time - timestamp > max_age_seconds:
                            sessions_to_remove.append(session_id)

                except (ValueError, IndexError):
                    # Skip if the timestamp cannot be extracted
                    continue

            # Remove the stale sessions
            for session_id in sessions_to_remove:
                del self.session_context_managers[session_id]
                logger.info(f"Stale session removed: {session_id}")

            logger.info(f"Session cleanup complete: {len(sessions_to_remove)} removed")

        except Exception as e:
            logger.error(f"Session cleanup failed: {e}")

# Global instance
integrated_memory_manager = IntegratedMemoryManager()
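A minimal usage sketch of the manager above (not part of this commit); the user, room, and session identifiers are illustrative only, and the sketch assumes the sibling modules in this diff are importable.

import time
from lily_llm_core.integrated_memory_manager import integrated_memory_manager

# Hypothetical IDs, following the session_id format noted in cleanup_old_sessions()
user_id, room_id = "alice", "room42"
session_id = f"room_{room_id}_user_{user_id}_{int(time.time())}"

# Record a turn, then pull the three memory layers back as one context
integrated_memory_manager.record_conversation(user_id, room_id, topic="PDF upload")
memory_ctx = integrated_memory_manager.get_memory_context(user_id, room_id, session_id)
print(memory_ctx.memory_summary)

# Combined string that would be placed in front of the prompt before generation
ai_context = integrated_memory_manager.get_context_for_ai(user_id, room_id, session_id)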
lily_llm_core/room_context_manager.py
ADDED
@@ -0,0 +1,374 @@
#!/usr/bin/env python3
"""
Room Context Manager
Manages an independent conversation context and document history per chat room.
"""

import logging
import time
import json
import os
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass, asdict
from pathlib import Path
from collections import deque

logger = logging.getLogger(__name__)

@dataclass
class RoomDocument:
    """A document uploaded to a chat room."""
    document_id: str
    filename: str
    uploaded_at: float
    uploaded_by: str
    document_type: str
    page_count: int
    chunk_count: int
    summary: Optional[str] = None
    tags: List[str] = None

    def __post_init__(self):
        if self.tags is None:
            self.tags = []

@dataclass
class RoomContext:
    """Per-chat-room context data."""
    room_id: str
    created_at: float
    last_updated: float

    # Basic info
    room_name: Optional[str] = None
    description: Optional[str] = None
    participants: List[str] = None

    # Document history
    documents: List[RoomDocument] = None

    # Conversation context
    conversation_summary: Optional[str] = None
    key_topics: List[str] = None
    total_messages: int = 0

    # AI settings
    ai_context: Dict[str, Any] = None

    def __post_init__(self):
        if self.participants is None:
            self.participants = []
        if self.documents is None:
            self.documents = []
        if self.key_topics is None:
            self.key_topics = []
        if self.ai_context is None:
            self.ai_context = {}

class RoomContextManager:
    """Per-chat-room context manager."""

    def __init__(self, storage_dir: str = "room_contexts"):
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(exist_ok=True)

        # Context cache
        self.room_cache: Dict[str, RoomContext] = {}
        self.cache_size_limit = 50  # cache at most 50 chat rooms

        # Statistics
        self.total_rooms = 0
        self.total_documents = 0

        logger.info(f"RoomContextManager initialized: {self.storage_dir}")

    def get_room_context(self, room_id: str) -> RoomContext:
        """Fetch a chat room context (cache first)."""
        # Check the cache first
        if room_id in self.room_cache:
            logger.debug(f"Room context served from cache: {room_id}")
            return self.room_cache[room_id]

        # Load from file
        context = self._load_context_from_file(room_id)
        if context:
            # Add to the cache
            self._add_to_cache(room_id, context)
            return context

        # Create a new context
        context = self._create_new_context(room_id)
        self._add_to_cache(room_id, context)
        return context

    def add_document(self, room_id: str, document_info: Dict[str, Any]) -> bool:
        """Add a document."""
        try:
            context = self.get_room_context(room_id)

            # Build the new document entry
            document = RoomDocument(
                document_id=document_info["document_id"],
                filename=document_info["filename"],
                uploaded_at=time.time(),
                uploaded_by=document_info.get("uploaded_by", "unknown"),
                document_type=document_info.get("document_type", "unknown"),
                page_count=document_info.get("page_count", 0),
                chunk_count=document_info.get("chunk_count", 0),
                summary=document_info.get("summary"),
                tags=document_info.get("tags", [])
            )

            # Append the document
            context.documents.append(document)
            context.total_documents = len(context.documents)
            context.last_updated = time.time()

            # Persist to file
            self._save_context_to_file(context)

            logger.info(f"Document added: {room_id} - {document.filename}")
            return True

        except Exception as e:
            logger.error(f"Failed to add document: {room_id} - {e}")
            return False

    def get_documents(self, room_id: str) -> List[RoomDocument]:
        """Return every document in a chat room."""
        try:
            context = self.get_room_context(room_id)
            return context.documents

        except Exception as e:
            logger.error(f"Failed to list documents: {room_id} - {e}")
            return []

    def find_document(self, room_id: str, document_id: str) -> Optional[RoomDocument]:
        """Find a specific document."""
        try:
            context = self.get_room_context(room_id)

            for doc in context.documents:
                if doc.document_id == document_id:
                    return doc

            return None

        except Exception as e:
            logger.error(f"Document lookup failed: {room_id} - {document_id} - {e}")
            return None

    def update_conversation_summary(self, room_id: str, summary: str, topics: List[str] = None) -> bool:
        """Update the conversation summary."""
        try:
            context = self.get_room_context(room_id)

            context.conversation_summary = summary
            if topics:
                context.key_topics = topics

            context.last_updated = time.time()

            self._save_context_to_file(context)
            logger.info(f"Conversation summary updated: {room_id} - {len(summary)} characters")
            return True

        except Exception as e:
            logger.error(f"Failed to update conversation summary: {room_id} - {e}")
            return False

    def increment_message_count(self, room_id: str) -> bool:
        """Increment the message count."""
        try:
            context = self.get_room_context(room_id)
            context.total_messages += 1
            context.last_updated = time.time()

            self._save_context_to_file(context)
            return True

        except Exception as e:
            logger.error(f"Failed to increment message count: {room_id} - {e}")
            return False

    def add_participant(self, room_id: str, user_id: str) -> bool:
        """Add a participant."""
        try:
            context = self.get_room_context(room_id)

            if user_id not in context.participants:
                context.participants.append(user_id)
                context.last_updated = time.time()

                self._save_context_to_file(context)
                logger.info(f"Participant added: {room_id} - {user_id}")
                return True

            return False

        except Exception as e:
            logger.error(f"Failed to add participant: {room_id} - {user_id} - {e}")
            return False

    def set_room_info(self, room_id: str, name: str = None, description: str = None) -> bool:
        """Set chat room info."""
        try:
            context = self.get_room_context(room_id)

            if name:
                context.room_name = name
            if description:
                context.description = description

            context.last_updated = time.time()

            self._save_context_to_file(context)
            logger.info(f"Chat room info set: {room_id} - {name}")
            return True

        except Exception as e:
            logger.error(f"Failed to set chat room info: {room_id} - {e}")
            return False

    def get_context_summary(self, room_id: str) -> Dict[str, Any]:
        """Return a summary of a chat room context."""
        try:
            context = self.get_room_context(room_id)

            return {
                "room_id": context.room_id,
                "room_name": context.room_name,
                "description": context.description,
                "participants": context.participants,
                "total_documents": len(context.documents),
                "total_messages": context.total_messages,
                "conversation_summary": context.conversation_summary,
                "key_topics": context.key_topics,
                "last_updated": context.last_updated,
                "documents": [
                    {
                        "document_id": doc.document_id if hasattr(doc, 'document_id') else doc.get('document_id', 'unknown'),
                        "filename": doc.filename if hasattr(doc, 'filename') else doc.get('filename', 'unknown'),
                        "uploaded_at": doc.uploaded_at if hasattr(doc, 'uploaded_at') else doc.get('uploaded_at', 0),
                        "document_type": doc.document_type if hasattr(doc, 'document_type') else doc.get('document_type', 'unknown'),
                        "page_count": doc.page_count if hasattr(doc, 'page_count') else doc.get('page_count', 0),
                        "chunk_count": doc.chunk_count if hasattr(doc, 'chunk_count') else doc.get('chunk_count', 0),
                        "summary": doc.summary if hasattr(doc, 'summary') else doc.get('summary', '')
                    }
                    for doc in context.documents[-5:]  # most recent 5 only
                ]
            }

        except Exception as e:
            logger.error(f"Failed to build context summary: {room_id} - {e}")
            return {}

    def _create_new_context(self, room_id: str) -> RoomContext:
        """Create a new chat room context."""
        context = RoomContext(
            room_id=room_id,
            created_at=time.time(),
            last_updated=time.time()
        )

        # Defaults
        context.room_name = f"Chat room {room_id}"
        context.description = "Newly created chat room."

        # Persist to file
        self._save_context_to_file(context)

        logger.info(f"New chat room context created: {room_id}")
        return context

    def _save_context_to_file(self, context: RoomContext) -> bool:
        """Save a context to file."""
        try:
            file_path = self.storage_dir / f"{context.room_id}.json"

            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(asdict(context), f, ensure_ascii=False, indent=2)

            return True

        except Exception as e:
            logger.error(f"Failed to save context file: {context.room_id} - {e}")
            return False

    def _load_context_from_file(self, room_id: str) -> Optional[RoomContext]:
        """Load a context from file."""
        try:
            file_path = self.storage_dir / f"{room_id}.json"

            if not file_path.exists():
                return None

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Convert each entry in the documents list into a RoomDocument
            if 'documents' in data and isinstance(data['documents'], list):
                documents = []
                for doc_data in data['documents']:
                    if isinstance(doc_data, dict):
                        # Convert a dict into a RoomDocument
                        doc = RoomDocument(**doc_data)
                        documents.append(doc)
                    else:
                        # Already a RoomDocument object
                        documents.append(doc_data)
                data['documents'] = documents

            # Build the RoomContext object
            context = RoomContext(**data)
            logger.debug(f"Context loaded from file: {room_id}")
            return context

        except Exception as e:
            logger.error(f"Failed to load context file: {room_id} - {e}")
            return None

    def _add_to_cache(self, room_id: str, context: RoomContext):
        """Add a context to the cache (respecting the size limit)."""
        if len(self.room_cache) >= self.cache_size_limit:
            # Evict the oldest context (LRU-style)
            oldest_room = min(self.room_cache.keys(),
                              key=lambda k: self.room_cache[k].last_updated)
            del self.room_cache[oldest_room]
            logger.debug(f"Evicted oldest context from cache: {oldest_room}")

        self.room_cache[room_id] = context

    def get_all_rooms(self) -> List[str]:
        """Return every chat room ID."""
        try:
            room_files = list(self.storage_dir.glob("*.json"))
            room_ids = [f.stem for f in room_files]
            return room_ids

        except Exception as e:
            logger.error(f"Failed to list chat rooms: {e}")
            return []

    def delete_room_context(self, room_id: str) -> bool:
        """Delete a chat room context."""
        try:
            # Remove from the cache
            if room_id in self.room_cache:
                del self.room_cache[room_id]

            # Delete the file
            file_path = self.storage_dir / f"{room_id}.json"
            if file_path.exists():
                file_path.unlink()

            logger.info(f"Chat room context deleted: {room_id}")
            return True

        except Exception as e:
            logger.error(f"Failed to delete chat room context: {room_id} - {e}")
            return False

# Global instance
room_context_manager = RoomContextManager()
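A minimal sketch (not part of this commit) of registering an uploaded document in a room and reading the summary back; all identifiers and counts are illustrative only.

from lily_llm_core.room_context_manager import room_context_manager

# Register a document in a room; only document_id and filename are required keys
room_context_manager.add_document("room42", {
    "document_id": "doc_001",      # illustrative values
    "filename": "report.pdf",
    "uploaded_by": "alice",
    "document_type": "pdf",
    "page_count": 12,
    "chunk_count": 48,
})

# The summary exposes the recent documents and room statistics
summary = room_context_manager.get_context_summary("room42")
print(summary["total_documents"], summary["documents"][-1]["filename"])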
lily_llm_core/user_memory_manager.py
ADDED
@@ -0,0 +1,302 @@
#!/usr/bin/env python3
"""
User Memory Manager
Per-user global long-term memory store, similar to ChatGPT's "memory" feature.
"""

import logging
import time
import json
import os
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass, asdict
from pathlib import Path
import hashlib

logger = logging.getLogger(__name__)

@dataclass
class UserMemory:
    """Per-user global memory data."""
    user_id: str
    created_at: float
    last_updated: float

    # Basic info
    name: Optional[str] = None
    preferences: Dict[str, Any] = None
    important_info: List[str] = None

    # Conversation patterns and preferences
    communication_style: Optional[str] = None
    language_preference: Optional[str] = None
    response_length_preference: Optional[str] = None

    # Expertise and interests
    expertise_areas: List[str] = None
    interests: List[str] = None

    # Usage statistics
    total_conversations: int = 0
    total_messages: int = 0
    favorite_topics: List[str] = None

    # AI settings and preferences
    ai_personality: Optional[str] = None
    ai_response_style: Optional[str] = None

    def __post_init__(self):
        if self.preferences is None:
            self.preferences = {}
        if self.important_info is None:
            self.important_info = []
        if self.expertise_areas is None:
            self.expertise_areas = []
        if self.interests is None:
            self.interests = []
        if self.favorite_topics is None:
            self.favorite_topics = []

class UserMemoryManager:
    """Per-user global memory manager."""

    def __init__(self, storage_dir: str = "user_memories"):
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(exist_ok=True)

        # Memory cache (for efficiency)
        self.memory_cache: Dict[str, UserMemory] = {}
        self.cache_size_limit = 100  # cache at most 100 users

        # Statistics
        self.total_users = 0
        self.total_memories = 0

        logger.info(f"UserMemoryManager initialized: {self.storage_dir}")

    def get_user_memory(self, user_id: str) -> UserMemory:
        """Fetch a user's memory (cache first)."""
        # Check the cache first
        if user_id in self.memory_cache:
            logger.debug(f"User memory served from cache: {user_id}")
            return self.memory_cache[user_id]

        # Load from file
        memory = self._load_memory_from_file(user_id)
        if memory:
            # Add to the cache (respecting the size limit)
            self._add_to_cache(user_id, memory)
            return memory

        # Create a new memory
        memory = self._create_new_memory(user_id)
        self._add_to_cache(user_id, memory)
        return memory

    def update_user_memory(self, user_id: str, updates: Dict[str, Any]) -> bool:
        """Update a user's memory."""
        try:
            memory = self.get_user_memory(user_id)

            # Apply the updates
            for key, value in updates.items():
                if hasattr(memory, key):
                    setattr(memory, key, value)
                    logger.debug(f"User memory updated: {user_id}.{key} = {value}")

            # Refresh the timestamp
            memory.last_updated = time.time()

            # Persist to file
            self._save_memory_to_file(memory)

            # Update the cache
            if user_id in self.memory_cache:
                self.memory_cache[user_id] = memory

            logger.info(f"User memory update complete: {user_id}")
            return True

        except Exception as e:
            logger.error(f"User memory update failed: {user_id} - {e}")
            return False

    def add_important_info(self, user_id: str, info: str) -> bool:
        """Add an important piece of info."""
        try:
            memory = self.get_user_memory(user_id)

            if info not in memory.important_info:
                memory.important_info.append(info)
                memory.last_updated = time.time()

                self._save_memory_to_file(memory)
                logger.info(f"Important info added: {user_id} - {info[:50]}...")
                return True

            return False

        except Exception as e:
            logger.error(f"Failed to add important info: {user_id} - {e}")
            return False

    def update_preferences(self, user_id: str, preferences: Dict[str, Any]) -> bool:
        """Update user preferences."""
        try:
            memory = self.get_user_memory(user_id)

            # Merge with existing preferences
            memory.preferences.update(preferences)
            memory.last_updated = time.time()

            self._save_memory_to_file(memory)
            logger.info(f"User preferences updated: {user_id} - {len(preferences)} items")
            return True

        except Exception as e:
            logger.error(f"Failed to update user preferences: {user_id} - {e}")
            return False

    def record_conversation(self, user_id: str, topic: str = None) -> bool:
        """Record a conversation (update statistics)."""
        try:
            memory = self.get_user_memory(user_id)

            memory.total_conversations += 1
            memory.total_messages += 1

            if topic and topic not in memory.favorite_topics:
                memory.favorite_topics.append(topic)
                # Keep at most 10
                if len(memory.favorite_topics) > 10:
                    memory.favorite_topics = memory.favorite_topics[-10:]

            memory.last_updated = time.time()

            self._save_memory_to_file(memory)
            return True

        except Exception as e:
            logger.error(f"Failed to record conversation: {user_id} - {e}")
            return False

    def get_memory_summary(self, user_id: str) -> Dict[str, Any]:
        """Return a summary of a user's memory."""
        try:
            memory = self.get_user_memory(user_id)

            return {
                "user_id": memory.user_id,
                "name": memory.name,
                "preferences": memory.preferences,
                "important_info": memory.important_info[:5],      # most recent 5 only
                "expertise_areas": memory.expertise_areas,
                "interests": memory.interests,
                "communication_style": memory.communication_style,
                "ai_personality": memory.ai_personality,
                "total_conversations": memory.total_conversations,
                "favorite_topics": memory.favorite_topics[-5:],   # most recent 5 only
                "last_updated": memory.last_updated
            }

        except Exception as e:
            logger.error(f"Failed to build memory summary: {user_id} - {e}")
            return {}

    def _create_new_memory(self, user_id: str) -> UserMemory:
        """Create a new user memory."""
        memory = UserMemory(
            user_id=user_id,
            created_at=time.time(),
            last_updated=time.time()
        )

        # Defaults
        memory.language_preference = "ko"
        memory.response_length_preference = "medium"
        memory.ai_personality = "friendly"
        memory.ai_response_style = "helpful"

        # Persist to file
        self._save_memory_to_file(memory)

        logger.info(f"New user memory created: {user_id}")
        return memory

    def _save_memory_to_file(self, memory: UserMemory) -> bool:
        """Save a memory to file."""
        try:
            file_path = self.storage_dir / f"{memory.user_id}.json"

            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(asdict(memory), f, ensure_ascii=False, indent=2)

            return True

        except Exception as e:
            logger.error(f"Failed to save memory file: {memory.user_id} - {e}")
            return False

    def _load_memory_from_file(self, user_id: str) -> Optional[UserMemory]:
        """Load a memory from file."""
        try:
            file_path = self.storage_dir / f"{user_id}.json"

            if not file_path.exists():
                return None

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Convert into a UserMemory object
            memory = UserMemory(**data)
            logger.debug(f"Memory loaded from file: {user_id}")
            return memory

        except Exception as e:
            logger.error(f"Failed to load memory file: {user_id} - {e}")
            return None

    def _add_to_cache(self, user_id: str, memory: UserMemory):
        """Add a memory to the cache (respecting the size limit)."""
        if len(self.memory_cache) >= self.cache_size_limit:
            # Evict the oldest memory (LRU-style)
            oldest_user = min(self.memory_cache.keys(),
                              key=lambda k: self.memory_cache[k].last_updated)
            del self.memory_cache[oldest_user]
            logger.debug(f"Evicted oldest memory from cache: {oldest_user}")

        self.memory_cache[user_id] = memory

    def get_all_users(self) -> List[str]:
        """Return every user ID."""
        try:
            user_files = list(self.storage_dir.glob("*.json"))
            user_ids = [f.stem for f in user_files]
|
276 |
+
return user_ids
|
277 |
+
|
278 |
+
except Exception as e:
|
279 |
+
logger.error(f"โ ์ฌ์ฉ์ ๋ชฉ๋ก ์กฐํ ์คํจ: {e}")
|
280 |
+
return []
|
281 |
+
|
282 |
+
def delete_user_memory(self, user_id: str) -> bool:
|
283 |
+
"""์ฌ์ฉ์ ๋ฉ๋ชจ๋ฆฌ ์ญ์ """
|
284 |
+
try:
|
285 |
+
# ์บ์์์ ์ ๊ฑฐ
|
286 |
+
if user_id in self.memory_cache:
|
287 |
+
del self.memory_cache[user_id]
|
288 |
+
|
289 |
+
# ํ์ผ ์ญ์
|
290 |
+
file_path = self.storage_dir / f"{user_id}.json"
|
291 |
+
if file_path.exists():
|
292 |
+
file_path.unlink()
|
293 |
+
|
294 |
+
logger.info(f"๐๏ธ ์ฌ์ฉ์ ๋ฉ๋ชจ๋ฆฌ ์ญ์ ์๋ฃ: {user_id}")
|
295 |
+
return True
|
296 |
+
|
297 |
+
except Exception as e:
|
298 |
+
logger.error(f"โ ์ฌ์ฉ์ ๋ฉ๋ชจ๋ฆฌ ์ญ์ ์คํจ: {user_id} - {e}")
|
299 |
+
return False
|
300 |
+
|
301 |
+
# ์ ์ญ ์ธ์คํด์ค
|
302 |
+
user_memory_manager = UserMemoryManager()
|
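For reference, a minimal usage sketch of the module-level user_memory_manager instance defined above. This is an illustration only and not part of the commit; the import path assumes the package layout lily_llm_core/user_memory_manager.py shown in the changed-file list, and the info string is hypothetical.

# Usage sketch (illustration only, not part of this diff).
from lily_llm_core.user_memory_manager import user_memory_manager

# Returns the cached record, falls back to user_memories/<user_id>.json,
# or creates a new default record ("ko", "medium", "friendly", "helpful").
memory = user_memory_manager.get_user_memory("kdy")

# Bump conversation/message counters; favorite_topics is capped at the last 10.
user_memory_manager.record_conversation("kdy", topic="๋ฌธ์ ์
๋ก๋: test_math.pdf")

# Persist a long-term fact (hypothetical example string), then read the compact
# summary that the server can feed into prompt construction.
user_memory_manager.add_important_info("kdy", "prefers short Korean answers")
summary = user_memory_manager.get_memory_summary("kdy")
print(summary["total_conversations"], summary["favorite_topics"])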
room_contexts/default.json
ADDED
@@ -0,0 +1,60 @@
{
  "room_id": "default",
  "created_at": 1755887199.998416,
  "last_updated": 1755888375.434728,
  "room_name": "์ฑํ
๋ฐฉ default",
  "description": "์๋ก ์์ฑ๋ ์ฑํ
๋ฐฉ์
๋๋ค.",
  "participants": [
    "kdy"
  ],
  "documents": [
    {
      "document_id": "32653a11",
      "filename": "oop์ปคํผ๋งค๋์ .pdf",
      "uploaded_at": 1755887816.48926,
      "uploaded_by": "kdy",
      "document_type": "pdf",
      "page_count": 0,
      "chunk_count": 0,
      "summary": "๋ฌธ์๊ฐ ์ฑ๊ณต์ ์ผ๋ก ์ฒ๋ฆฌ๋์์ต๋๋ค.",
      "tags": []
    },
    {
      "document_id": "8991b80f",
      "filename": "test_math.pdf",
      "uploaded_at": 1755887939.8704937,
      "uploaded_by": "kdy",
      "document_type": "pdf",
      "page_count": 0,
      "chunk_count": 0,
      "summary": "๋ฌธ์๊ฐ ์ฑ๊ณต์ ์ผ๋ก ์ฒ๋ฆฌ๋์์ต๋๋ค.",
      "tags": []
    },
    {
      "document_id": "8ed78561",
      "filename": "test_design.pdf",
      "uploaded_at": 1755888122.0620363,
      "uploaded_by": "kdy",
      "document_type": "pdf",
      "page_count": 0,
      "chunk_count": 0,
      "summary": "๋ฌธ์๊ฐ ์ฑ๊ณต์ ์ผ๋ก ์ฒ๋ฆฌ๋์์ต๋๋ค.",
      "tags": []
    },
    {
      "document_id": "c737d6d1",
      "filename": "test_design.pdf",
      "uploaded_at": 1755888375.4331617,
      "uploaded_by": "kdy",
      "document_type": "pdf",
      "page_count": 0,
      "chunk_count": 0,
      "summary": "๋ฌธ์๊ฐ ์ฑ๊ณต์ ์ผ๋ก ์ฒ๋ฆฌ๋์์ต๋๋ค.",
      "tags": []
    }
  ],
  "conversation_summary": null,
  "key_topics": [],
  "total_messages": 4,
  "ai_context": {}
}
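The room context above is plain JSON on disk, so it can be inspected directly. A small reading sketch follows (illustration only, using just the standard library; the server itself goes through lily_llm_core/room_context_manager.py, whose API is not shown in this section).

# Illustration only: inspect the committed room context file directly.
import json
from pathlib import Path

context = json.loads(Path("room_contexts/default.json").read_text(encoding="utf-8"))
print(context["room_id"], context["total_messages"])  # "default", 4
for doc in context["documents"]:
    print(doc["document_id"], doc["filename"], doc["uploaded_by"])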
user_memories/anonymous.json
ADDED
@@ -0,0 +1,18 @@
{
  "user_id": "anonymous",
  "created_at": 1755887199.9962583,
  "last_updated": 1755887199.9962583,
  "name": null,
  "preferences": {},
  "important_info": [],
  "communication_style": null,
  "language_preference": "ko",
  "response_length_preference": "medium",
  "expertise_areas": [],
  "interests": [],
  "total_conversations": 0,
  "total_messages": 0,
  "favorite_topics": [],
  "ai_personality": "friendly",
  "ai_response_style": "helpful"
}
user_memories/kdy.json
ADDED
@@ -0,0 +1,22 @@
{
  "user_id": "kdy",
  "created_at": 1755887816.4909644,
  "last_updated": 1755888375.434728,
  "name": null,
  "preferences": {},
  "important_info": [],
  "communication_style": null,
  "language_preference": "ko",
  "response_length_preference": "medium",
  "expertise_areas": [],
  "interests": [],
  "total_conversations": 8,
  "total_messages": 8,
  "favorite_topics": [
    "๋ฌธ์ ์
๋ก๋: oop์ปคํผ๋งค๋์ .pdf",
    "๋ฌธ์ ์
๋ก๋: test_math.pdf",
    "๋ฌธ์ ์
๋ก๋: test_design.pdf"
  ],
  "ai_personality": "friendly",
  "ai_response_style": "helpful"
}
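These user-memory files are the direct asdict() serialization written by _save_memory_to_file and read back with UserMemory(**data) in _load_memory_from_file. A round-trip sketch, assuming the UserMemory dataclass is importable from the module (illustration only):

# Round-trip sketch mirroring _save_memory_to_file / _load_memory_from_file.
import json
from dataclasses import asdict
from lily_llm_core.user_memory_manager import UserMemory  # assumed import path

with open("user_memories/kdy.json", encoding="utf-8") as f:
    memory = UserMemory(**json.load(f))  # JSON file -> dataclass

assert memory.total_messages == 8
text = json.dumps(asdict(memory), ensure_ascii=False, indent=2)  # dataclass -> JSON text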