gbrabbit committed on
Commit
d4fc1d3
·
1 Parent(s): 0b6f31f

Auto commit at 23-2025-08 3:51:46

Browse files
lily_llm_api/app_v2.py CHANGED
@@ -60,6 +60,9 @@ from lily_llm_core.hybrid_rag_processor import hybrid_rag_processor
60
  # 컨텍스트 관리자 및 LoRA 관리자 추가
61
  from lily_llm_core.context_manager import get_context_manager, context_manager
62
 
 
 
 
63
  # 전역 변수들
64
  current_model = None # 🔄 현재 로드된 모델 인스턴스
65
  current_profile = None # 🔄 현재 선택된 모델 프로필
@@ -491,7 +494,8 @@ def load_model_sync(model_id: str):
491
 
492
  def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None,
493
  temperature: Optional[float] = None, top_p: Optional[float] = None,
494
- do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None) -> dict:
 
495
  """[์ตœ์ ํ™”] ๋ชจ๋ธ ์ƒ์„ฑ์„ ์ฒ˜๋ฆฌํ•˜๋Š” ํ†ตํ•ฉ ๋™๊ธฐ ํ•จ์ˆ˜"""
496
  try:
497
  print(f"๐Ÿ” [DEBUG] generate_sync ์‹œ์ž‘ - prompt ๊ธธ์ด: {len(prompt)}")
@@ -515,16 +519,24 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
515
  combined_image_metas = None
516
 
517
  # --- 1. ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ (๊ณต์‹ ๋ฐฉ์‹) ---
518
- if image_data_list and len([img for img in image_data_list if img]) > 0 and getattr(current_profile, 'multimodal', False):
519
- print(f"๐Ÿ” [DEBUG] ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์‹œ์ž‘ - ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜: {len([img for img in image_data_list if img])}")
 
 
 
 
 
 
 
 
520
 
521
  # ๐Ÿ”„ ๊ณต์‹ ๋ฐฉ์‹: ๊ฐ„๋‹จํ•œ ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ
522
- max_images = min(len(image_data_list), 4)
523
  logger.info(f"๐Ÿ–ผ๏ธ ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ์ฒ˜๋ฆฌ ์‹œ์ž‘... (์ด๋ฏธ์ง€ {max_images}๊ฐœ)")
524
 
525
  try:
526
  metas_list = []
527
- for idx, image_bytes in enumerate(image_data_list[:max_images]):
528
  if image_bytes:
529
  try:
530
  pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
@@ -568,31 +580,77 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
568
 
569
  # 2. RAG ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ปจํ…์ŠคํŠธ (PDF ๋‚ด์šฉ ํฌํ•จ)
570
  try:
571
- # ๐Ÿ”„ ๊ฐ„๋‹จํ•˜๊ณ  ์ง์ ‘์ ์ธ RAG ์ปจํ…์ŠคํŠธ ๋กœ๋“œ
572
  rag_context = ""
573
 
574
- # ๐Ÿ”„ ๊ฐ„๋‹จํ•œ ํŒŒ์ผ ๊ธฐ๋ฐ˜ RAG ์ปจํ…์ŠคํŠธ ๋กœ๋“œ
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  try:
576
- # vector_stores ๋””๋ ‰ํ† ๋ฆฌ์—์„œ ์ตœ๊ทผ ๋ฌธ์„œ ๋‚ด์šฉ ์ง์ ‘ ์ฝ๊ธฐ
577
- import os
578
- import json
579
-
580
- vector_store_dir = "vector_stores"
581
- if os.path.exists(vector_store_dir):
582
- user_dirs = [d for d in os.listdir(vector_store_dir) if os.path.isdir(os.path.join(vector_store_dir, d))]
583
- if user_dirs:
584
- user_dir = user_dirs[0] # ์ฒซ ๋ฒˆ์งธ ์‚ฌ์šฉ์ž
585
- user_path = os.path.join(vector_store_dir, user_dir)
 
 
 
 
586
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
  if os.path.exists(user_path):
588
- doc_dirs = [d for d in os.listdir(user_path) if os.path.isdir(os.path.join(user_path, d))]
589
- if doc_dirs:
590
- # ์ตœ๊ทผ ๋ฌธ์„œ ID ์‚ฌ์šฉ
591
- recent_doc_id = doc_dirs[-1]
592
- print(f"๐Ÿ” [DEBUG] RAG ์ปจํ…์ŠคํŠธ ๊ฒ€์ƒ‰: ์‚ฌ์šฉ์ž={user_dir}, ๋ฌธ์„œ={recent_doc_id}")
593
 
594
  # ๋ฌธ์„œ ๋‚ด์šฉ ํŒŒ์ผ ์ง์ ‘ ์ฝ๊ธฐ (pickle ํŒŒ์ผ ์ง€์›)
595
- doc_path = os.path.join(user_path, recent_doc_id)
596
  if os.path.exists(doc_path):
597
  # ๐Ÿ”„ pickle ํŒŒ์ผ์—์„œ ๋‚ด์šฉ ์ฝ๊ธฐ (์šฐ์„ )
598
  pickle_file = os.path.join(doc_path, "simple_vector_store.pkl")
@@ -607,18 +665,64 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
607
  documents_data = vector_store_data['documents']
608
  if documents_data and len(documents_data) > 0:
609
  rag_context = "\n\n๐Ÿ“„ ์—…๋กœ๋“œ๋œ ๋ฌธ์„œ ๋‚ด์šฉ:\n"
 
 
 
610
  for i, doc in enumerate(documents_data[:2]): # ์ตœ๋Œ€ 2๊ฐœ ์ฒญํฌ
611
  if hasattr(doc, 'page_content'):
612
  content = doc.page_content.strip()
613
  if content and len(content) > 30:
614
- # ๋‚ด์šฉ์„ 200์ž๋กœ ์ œํ•œ
615
- truncated_content = content[:200] + "..." if len(content) > 200 else content
616
- rag_context += f"--- ์ฒญํฌ {i+1} ---\n{truncated_content}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617
 
618
  if len(rag_context) > 30:
619
  context_prompt += rag_context
620
  print(f"๐Ÿ” [DEBUG] RAG ์ปจํ…์ŠคํŠธ ํฌํ•จ๋จ - ๊ธธ์ด: {len(rag_context)}")
621
  print(f"๐Ÿ” [DEBUG] RAG ์ปจํ…์ŠคํŠธ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {rag_context[:100]}...")
 
 
 
 
 
 
622
  else:
623
  print(f"โš ๏ธ [DEBUG] RAG ์ปจํ…์ŠคํŠธ๊ฐ€ ๋„ˆ๋ฌด ์งง์Œ: {len(rag_context)}")
624
  else:
@@ -731,7 +835,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
731
  # --- 3. ํ† ํฌ๋‚˜์ด์ง• ---
732
  print(f"๐Ÿ” [DEBUG] ํ† ํฌ๋‚˜์ด์ง• ์‹œ์ž‘")
733
  t_tok_start = time.time()
734
- if not image_data_list or len([img for img in image_data_list if img]) == 0:
735
  # ํ…์ŠคํŠธ-only ๊ณ ์ • ๊ฒฝ๋กœ (๋” ๋น ๋ฆ„)
736
  print(f"๐Ÿ” [DEBUG] ํ…์ŠคํŠธ-only ํ† ํฌ๋‚˜์ด์ง• ๊ฒฝ๋กœ")
737
  print(f"๐Ÿ” [DEBUG] ์‚ฌ์šฉํ•  ํ”„๋กฌํ”„ํŠธ: {formatted_prompt}")
@@ -754,6 +858,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
754
  # ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ(Lite): Kanana ์ „์šฉ encode_prompt๋กœ -1 ํ† ํฐ ์ž๋ฆฌ ์ƒ์„ฑ (ํ•„์ˆ˜)
755
  print(f"๐Ÿ” [DEBUG] ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ํ† ํฌ๋‚˜์ด์ง• ๊ฒฝ๋กœ")
756
  print(f"๐Ÿ” [DEBUG] combined_image_metas: {combined_image_metas}")
 
757
 
758
  if hasattr(tokenizer, 'encode_prompt'):
759
  print(f"๐Ÿ” [DEBUG] encode_prompt ๋ฉ”์„œ๋“œ ์‚ฌ์šฉ")
@@ -1352,6 +1457,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
1352
  total_time = time.time() - t_tok_start
1353
  print(f"๐Ÿ” [DEBUG] ์ „์ฒด ์ฒ˜๋ฆฌ ์™„๋ฃŒ - ์ด ์†Œ์š”์‹œ๊ฐ„: {total_time:.3f}์ดˆ")
1354
 
 
 
1355
  return {
1356
  "generated_text": response,
1357
  "processing_time": total_time,
@@ -1527,6 +1634,8 @@ async def generate(request: Request,
1527
  image2: UploadFile = File(None),
1528
  image3: UploadFile = File(None),
1529
  image4: UploadFile = File(None),
 
 
1530
  use_context: bool = Form(True),
1531
  session_id: str = Form(None)):
1532
 
@@ -1535,20 +1644,12 @@ async def generate(request: Request,
1535
 
1536
  start_time = time.time()
1537
 
1538
- # ์„ธ์…˜ ID๊ฐ€ ์—†์œผ๋ฉด ์ž๋™ ์ƒ์„ฑ (ํด๋ผ์ด์–ธํŠธ๋ณ„ ๊ณ ์œ  ์„ธ์…˜)
1539
  if not session_id:
1540
- # ํด๋ผ์ด์–ธํŠธ IP ๊ธฐ๋ฐ˜์œผ๋กœ ๊ณ ์œ ํ•œ ์„ธ์…˜ ์ƒ์„ฑ (๊ฐ™์€ ํด๋ผ์ด์–ธํŠธ๋Š” ๊ฐ™์€ ์„ธ์…˜ ์œ ์ง€)
1541
- client_ip = "unknown"
1542
- try:
1543
- # Request ๊ฐ์ฒด์—์„œ ํด๋ผ์ด์–ธํŠธ IP ์ถ”์ถœ
1544
- client_ip = request.client.host if request.client else "unknown"
1545
- except:
1546
- pass
1547
-
1548
- # ํด๋ผ์ด์–ธํŠธ IP + ์‹œ๊ฐ„ ๊ธฐ๋ฐ˜์œผ๋กœ ์„ธ์…˜ ์ƒ์„ฑ (ํ•˜๋ฃจ ๋™์•ˆ ์œ ์ง€)
1549
- day_timestamp = int(time.time() // 86400) * 86400 # ํ•˜๋ฃจ ๋‹จ์œ„๋กœ ๋ฐ˜์˜ฌ๋ฆผ
1550
- session_id = f"client_{client_ip}_{day_timestamp}"
1551
- print(f"๐Ÿ” [DEBUG] ์ž๋™ ์„ธ์…˜ ID ์ƒ์„ฑ: {session_id} (ํด๋ผ์ด์–ธํŠธ: {client_ip})")
1552
 
1553
  if use_context:
1554
  context_manager.add_user_message(prompt, metadata={"session_id": session_id})
@@ -1566,7 +1667,7 @@ async def generate(request: Request,
1566
 
1567
  try:
1568
  # generate_sync ํ•จ์ˆ˜ ํ˜ธ์ถœ (์ปจํ…์ŠคํŠธ ํฌํ•จ)
1569
- result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id)
1570
 
1571
  if "error" in result:
1572
  raise HTTPException(status_code=500, detail=result["error"])
@@ -1716,6 +1817,7 @@ async def health_check():
1716
  async def upload_document(
1717
  file: UploadFile = File(...),
1718
  user_id: str = Form("default_user"), # ๊ธฐ๋ณธ ์‚ฌ์šฉ์ž ID
 
1719
  document_id: Optional[str] = Form(None) # ๋ฌธ์„œ ID (์ž๋™ ์ƒ์„ฑ ๊ฐ€๋Šฅ)
1720
  ):
1721
  """๋ฌธ์„œ ์—…๋กœ๋“œ ๋ฐ RAG ์ฒ˜๋ฆฌ"""
@@ -1746,73 +1848,47 @@ async def upload_document(
1746
  processing_time = time.time() - start_time
1747
  logger.info(f"๐Ÿ“„ ๋ฌธ์„œ ์—…๋กœ๋“œ ์™„๋ฃŒ ({processing_time:.2f}์ดˆ): {file.filename}")
1748
 
1749
- # ๋ฌธ์„œ ์—…๋กœ๋“œ ํ›„ ์ž๋™์œผ๋กœ AI ์‘๋‹ต ์ƒ์„ฑ
1750
  if result["success"]:
1751
  try:
1752
- # ๊ฐ„๋‹จํ•œ ์š”์•ฝ ์งˆ๋ฌธ์œผ๋กœ AI ์‘๋‹ต ์ƒ์„ฑ
1753
- summary_query = f"์—…๋กœ๋“œ๋œ ๋ฌธ์„œ '{file.filename}'์˜ ์ฃผ์š” ๋‚ด์šฉ์„ ์š”์•ฝํ•ด์ฃผ์„ธ์š”."
 
1754
 
1755
- logger.info(f"๐Ÿค– ๋ฌธ์„œ ์—…๋กœ๋“œ ํ›„ AI ์‘๋‹ต ์ƒ์„ฑ ์‹œ์ž‘...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1756
 
1757
- # ๐Ÿ”„ RAG ์‹œ์Šคํ…œ์„ ์‚ฌ์šฉํ•˜์—ฌ ๋ฌธ์„œ ๋‚ด์šฉ ๊ธฐ๋ฐ˜ ์‘๋‹ต ์ƒ์„ฑ
1758
- try:
1759
- # RAG ๊ฒ€์ƒ‰์„ ํ†ตํ•ด ๋ฌธ์„œ ๋‚ด์šฉ์„ ํฌํ•จํ•œ ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ
1760
- summary_query = f"์—…๋กœ๋“œ๋œ ๋ฌธ์„œ '{file.filename}'์˜ ์ฃผ์š” ๋‚ด์šฉ์„ ์š”์•ฝํ•ด์ฃผ์„ธ์š”."
1761
-
1762
- # RAG ์‘๋‹ต ์ƒ์„ฑ (๋ฌธ์„œ ๋‚ด์šฉ ํฌํ•จ)
1763
- rag_result = rag_processor.generate_rag_response(
1764
- user_id, document_id, summary_query, llm_model=model
1765
- )
1766
-
1767
- if rag_result["success"]:
1768
- logger.info(f"โœ… ์ž๋™ AI ์‘๋‹ต ์ƒ์„ฑ ์™„๋ฃŒ: {len(rag_result['response'])} ๋ฌธ์ž")
1769
- result["auto_response"] = rag_result["response"]
1770
- else:
1771
- logger.warning(f"โš ๏ธ ์ž๋™ AI ์‘๋‹ต ์ƒ์„ฑ ์‹คํŒจ: {rag_result.get('error', 'Unknown error')}")
1772
- # ๐Ÿ”„ RAG ์‹คํŒจ ์‹œ generate_sync ์‚ฌ์šฉ (fallback)
1773
- if model and hasattr(model, 'generate'):
1774
- try:
1775
- from .app_v2 import generate_sync
1776
-
1777
- # ๋ฌธ์„œ ๋‚ด์šฉ์„ ํฌํ•จํ•œ ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ
1778
- context_prompt = f"""
1779
- ๋‹ค์Œ ๋ฌธ์„œ์˜ ๋‚ด์šฉ์„ ๋ฐ”ํƒ•์œผ๋กœ ์š”์•ฝํ•ด์ฃผ์„ธ์š”:
1780
-
1781
- ๋ฌธ์„œ๋ช…: {file.filename}
1782
- ๋ฌธ์„œ ID: {document_id}
1783
-
1784
- ๋ฌธ์„œ ๋‚ด์šฉ:
1785
- {result.get('chunks', [])}
1786
-
1787
- ์œ„ ๋ฌธ์„œ์˜ ์ฃผ์š” ๋‚ด์šฉ์„ ์š”์•ฝํ•ด์ฃผ์„ธ์š”.
1788
- """
1789
-
1790
- response = generate_sync(
1791
- prompt=context_prompt,
1792
- image_data_list=None,
1793
- session_id=None
1794
- )
1795
-
1796
- if response and "response" in response:
1797
- result["auto_response"] = response["response"]
1798
- logger.info(f"โœ… fallback AI ์‘๋‹ต ์ƒ์„ฑ ์™„๋ฃŒ: {len(response['response'])} ๋ฌธ์ž")
1799
- else:
1800
- result["auto_response"] = "๋ฌธ์„œ ์š”์•ฝ์„ ์ƒ์„ฑํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
1801
- logger.warning(f"โš ๏ธ generate_sync ์‘๋‹ต ํ˜•์‹ ์˜ค๋ฅ˜")
1802
-
1803
- except Exception as e:
1804
- logger.error(f"โŒ fallback generate_sync ํ˜ธ์ถœ ์‹คํŒจ: {e}")
1805
- result["auto_response"] = "๋ฌธ์„œ ์š”์•ฝ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค."
1806
- else:
1807
- result["auto_response"] = "๋ฌธ์„œ ์š”์•ฝ์„ ์ƒ์„ฑํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
1808
-
1809
- except Exception as e:
1810
- logger.error(f"โŒ RAG ์‘๋‹ต ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜: {e}")
1811
- result["auto_response"] = "๋ฌธ์„œ ์š”์•ฝ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค."
1812
-
1813
  except Exception as e:
1814
- logger.error(f"โŒ ์ž๋™ AI ์‘๋‹ต ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜: {e}")
1815
- result["auto_response"] = "๋ฌธ์„œ ์š”์•ฝ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค."
 
 
 
 
 
 
 
 
 
 
1816
 
1817
  return DocumentUploadResponse(
1818
  success=result["success"],
 
60
  # 컨텍스트 관리자 및 LoRA 관리자 추가
61
  from lily_llm_core.context_manager import get_context_manager, context_manager
62
 
63
+ # 계층적 메모리 시스템 추가
64
+ from lily_llm_core.integrated_memory_manager import integrated_memory_manager
65
+
66
  # 전역 변수들
67
  current_model = None # 🔄 현재 로드된 모델 인스턴스
68
  current_profile = None # 🔄 현재 선택된 모델 프로필
 
494
 
495
  def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None,
496
  temperature: Optional[float] = None, top_p: Optional[float] = None,
497
+ do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
498
+ user_id: str = "anonymous", room_id: str = "default") -> dict:
499
  """[์ตœ์ ํ™”] ๋ชจ๋ธ ์ƒ์„ฑ์„ ์ฒ˜๋ฆฌํ•˜๋Š” ํ†ตํ•ฉ ๋™๊ธฐ ํ•จ์ˆ˜"""
500
  try:
501
  print(f"๐Ÿ” [DEBUG] generate_sync ์‹œ์ž‘ - prompt ๊ธธ์ด: {len(prompt)}")
 
519
  combined_image_metas = None
520
 
521
  # --- 1. ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ (๊ณต์‹ ๋ฐฉ์‹) ---
522
+ # ๐Ÿ”„ RAG์—์„œ ์ถ”์ถœ๋œ ์ด๋ฏธ์ง€ ๋ฐ์ดํ„ฐ๋„ ํฌํ•จ
523
+ all_image_data = []
524
+ if image_data_list and len([img for img in image_data_list if img]) > 0:
525
+ all_image_data.extend(image_data_list)
526
+ print(f"๐Ÿ” [DEBUG] ์ง์ ‘ ์ „๋‹ฌ๋œ ์ด๋ฏธ์ง€ {len(image_data_list)}๊ฐœ ์ถ”๊ฐ€")
527
+
528
+ # ๐Ÿ”„ RAG์—์„œ ์ถ”์ถœ๋œ ์ด๋ฏธ์ง€ ๋ฐ์ดํ„ฐ๋Š” ํ˜„์žฌ ๊ตฌํ˜„์—์„œ ์ œ๊ฑฐ๋จ (์ „์—ญ ๋ณ€์ˆ˜ ๋ฌธ์ œ ํ•ด๊ฒฐ)
529
+
530
+ if all_image_data and len([img for img in all_image_data if img]) > 0 and getattr(current_profile, 'multimodal', False):
531
+ print(f"๐Ÿ” [DEBUG] ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์‹œ์ž‘ - ์ด ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜: {len([img for img in all_image_data if img])}")
532
 
533
  # ๐Ÿ”„ ๊ณต์‹ ๋ฐฉ์‹: ๊ฐ„๋‹จํ•œ ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ
534
+ max_images = min(len(all_image_data), 4)
535
  logger.info(f"๐Ÿ–ผ๏ธ ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ์ฒ˜๋ฆฌ ์‹œ์ž‘... (์ด๋ฏธ์ง€ {max_images}๊ฐœ)")
536
 
537
  try:
538
  metas_list = []
539
+ for idx, image_bytes in enumerate(all_image_data[:max_images]):
540
  if image_bytes:
541
  try:
542
  pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
 
580
 
581
  # 2. RAG ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ปจํ…์ŠคํŠธ (PDF ๋‚ด์šฉ ํฌํ•จ)
582
  try:
583
+ # ๐Ÿ”„ ์ƒˆ๋กœ์šด ๋ฉ”๋ชจ๋ฆฌ ์‹œ์Šคํ…œ์„ ์‚ฌ์šฉํ•œ RAG ์ปจํ…์ŠคํŠธ ๋กœ๋“œ
584
  rag_context = ""
585
 
586
+ # ํ†ตํ•ฉ ๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ์ž์—์„œ AI์šฉ ์ปจํ…์ŠคํŠธ ์ƒ์„ฑ
587
+ ai_context = integrated_memory_manager.get_context_for_ai(
588
+ user_id=user_id,
589
+ room_id=room_id,
590
+ session_id=session_id,
591
+ include_user_memory=True,
592
+ include_room_context=True,
593
+ include_session_history=False # ํ˜„์žฌ ๋Œ€ํ™”๋Š” ๋ณ„๋„๋กœ ์ฒ˜๋ฆฌ
594
+ )
595
+
596
+ if ai_context:
597
+ rag_context += f"\n\n๐Ÿ”— ๋ฉ”๋ชจ๋ฆฌ ์ปจํ…์ŠคํŠธ:\n{ai_context}\n"
598
+ print(f"๐Ÿ” [DEBUG] ๋ฉ”๋ชจ๋ฆฌ ์ปจํ…์ŠคํŠธ ํฌํ•จ๋จ - ๊ธธ์ด: {len(ai_context)}")
599
+
600
+ # ๊ธฐ์กด RAG ์‹œ์Šคํ…œ์—์„œ ๋ฌธ์„œ ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ (room_id ๊ธฐ๋ฐ˜)
601
  try:
602
+ # ์ฑ„ํŒ…๋ฐฉ๋ณ„ ๋ฌธ์„œ ์ปจํ…์ŠคํŠธ ์กฐํšŒ
603
+ room_context = integrated_memory_manager.room_context_manager.get_room_context(room_id)
604
+ if room_context and room_context.documents:
605
+ rag_context += "\n\n๐Ÿ“„ ์—…๋กœ๋“œ๋œ ๋ฌธ์„œ ๋ชฉ๋ก:\n"
606
+ for doc in room_context.documents[-3:]: # ์ตœ๊ทผ 3๊ฐœ๋งŒ
607
+ # ๋”•์…”๋„ˆ๋ฆฌ์™€ ๊ฐ์ฒด ๋ชจ๋‘ ์ฒ˜๋ฆฌ
608
+ if isinstance(doc, dict):
609
+ filename = doc.get('filename', 'unknown')
610
+ doc_type = doc.get('document_type', 'unknown')
611
+ page_count = doc.get('page_count', 0)
612
+ else:
613
+ filename = getattr(doc, 'filename', 'unknown')
614
+ doc_type = getattr(doc, 'document_type', 'unknown')
615
+ page_count = getattr(doc, 'page_count', 0)
616
 
617
+ rag_context += f" - {filename} ({doc_type}, {page_count}ํŽ˜์ด์ง€)\n"
618
+
619
+ print(f"๐Ÿ” [DEBUG] ์ฑ„ํŒ…๋ฐฉ {room_id}์˜ ๋ฌธ์„œ {len(room_context.documents)}๊ฐœ ๋ฐœ๊ฒฌ")
620
+
621
+ except Exception as e:
622
+ print(f"โš ๏ธ ์ฑ„ํŒ…๋ฐฉ ๋ฌธ์„œ ์ปจํ…์ŠคํŠธ ๋กœ๋“œ ์‹คํŒจ: {e}")
623
+
624
+ # ์ƒˆ๋กœ์šด ๋ฉ”๋ชจ๋ฆฌ ์‹œ์Šคํ…œ ๊ธฐ๋ฐ˜ RAG ์ปจํ…์ŠคํŠธ ๋กœ๋“œ
625
+ try:
626
+ # ํ˜„์žฌ ์ฑ„ํŒ…๋ฐฉ์˜ ์ตœ์‹  ๋ฌธ์„œ ID ์‚ฌ์šฉ
627
+ room_context = integrated_memory_manager.room_context_manager.get_room_context(room_id)
628
+ if room_context and room_context.documents:
629
+ # ๊ฐ€์žฅ ์ตœ๊ทผ์— ์—…๋กœ๋“œ๋œ ๋ฌธ์„œ ์‚ฌ์šฉ
630
+ latest_doc = room_context.documents[-1]
631
+
632
+ # ๋”•์…”๋„ˆ๋ฆฌ์™€ ๊ฐ์ฒด ๋ชจ๋‘ ์ฒ˜๋ฆฌ
633
+ if isinstance(latest_doc, dict):
634
+ latest_doc_id = latest_doc.get('document_id', 'unknown')
635
+ latest_user_id = latest_doc.get('uploaded_by', 'unknown')
636
+ else:
637
+ latest_doc_id = getattr(latest_doc, 'document_id', 'unknown')
638
+ latest_user_id = getattr(latest_doc, 'uploaded_by', 'unknown')
639
+
640
+ print(f"๐Ÿ” [DEBUG] ์ƒˆ๋กœ์šด RAG ์ปจํ…์ŠคํŠธ ๊ฒ€์ƒ‰: ์ฑ„ํŒ…๋ฐฉ={room_id}, ์‚ฌ์šฉ์ž={latest_user_id}, ๋ฌธ์„œ={latest_doc_id}")
641
+
642
+ # vector_stores ๋””๋ ‰ํ† ๋ฆฌ์—์„œ ํ•ด๋‹น ๋ฌธ์„œ ๋‚ด์šฉ ์ฝ๊ธฐ
643
+ import os
644
+ import json
645
+
646
+ vector_store_dir = "vector_stores"
647
+ if os.path.exists(vector_store_dir):
648
+ user_path = os.path.join(vector_store_dir, latest_user_id)
649
  if os.path.exists(user_path):
650
+ doc_path = os.path.join(user_path, latest_doc_id)
651
+ if os.path.exists(doc_path):
 
 
 
652
 
653
  # ๋ฌธ์„œ ๋‚ด์šฉ ํŒŒ์ผ ์ง์ ‘ ์ฝ๊ธฐ (pickle ํŒŒ์ผ ์ง€์›)
 
654
  if os.path.exists(doc_path):
655
  # ๐Ÿ”„ pickle ํŒŒ์ผ์—์„œ ๋‚ด์šฉ ์ฝ๊ธฐ (์šฐ์„ )
656
  pickle_file = os.path.join(doc_path, "simple_vector_store.pkl")
 
665
  documents_data = vector_store_data['documents']
666
  if documents_data and len(documents_data) > 0:
667
  rag_context = "\n\n๐Ÿ“„ ์—…๋กœ๋“œ๋œ ๋ฌธ์„œ ๋‚ด์šฉ:\n"
668
+
669
+ # ๐Ÿš€ ์‹ค๋ฌด ์ˆ˜์ค€ ๊ตฌ์กฐํ™”๋œ PDF ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ
670
+ extracted_images = []
671
  for i, doc in enumerate(documents_data[:2]): # ์ตœ๋Œ€ 2๊ฐœ ์ฒญํฌ
672
  if hasattr(doc, 'page_content'):
673
  content = doc.page_content.strip()
674
  if content and len(content) > 30:
675
+ # ๊ตฌ์กฐํ™”๋œ ์ปจํ…์ธ ์ธ์ง€ ํ™•์ธ
676
+ if "=== ํŽ˜์ด์ง€" in content and "[ํ…์ŠคํŠธ ๋ธ”๋ก" in content:
677
+ # ์‹ค๋ฌด ์ˆ˜์ค€ ๊ตฌ์กฐํ™”๋œ ์ปจํ…์ธ 
678
+ truncated_content = content[:400] + "..." if len(content) > 400 else content
679
+ rag_context += f"--- ๊ตฌ์กฐํ™”๋œ ํŽ˜์ด์ง€ {i+1} ---\n{truncated_content}\n\n"
680
+ print(f"๐Ÿš€ [DEBUG] ๊ตฌ์กฐํ™”๋œ PDF ํŽ˜์ด์ง€ ๋ฐ์ดํ„ฐ ๋กœ๋“œ๋จ")
681
+ else:
682
+ # ๊ธฐ์กด ๋ฐฉ์‹
683
+ truncated_content = content[:200] + "..." if len(content) > 200 else content
684
+ rag_context += f"--- ์ฒญํฌ {i+1} ---\n{truncated_content}\n\n"
685
+
686
+ # ๐Ÿš€ ๊ตฌ์กฐํ™”๋œ ์ด๋ฏธ์ง€ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ
687
+ if hasattr(doc, 'metadata') and doc.metadata:
688
+ metadata = doc.metadata
689
+
690
+ # ์‹ค๋ฌด ์ˆ˜์ค€ ๊ตฌ์กฐํ™”๋œ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํ™•์ธ
691
+ if metadata.get('structured_analysis') and metadata.get('spatial_relationships'):
692
+ print(f"๐Ÿš€ [DEBUG] ์‹ค๋ฌด ์ˆ˜์ค€ ๊ตฌ์กฐํ™”๋œ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋ฐœ๊ฒฌ")
693
+
694
+ # ์ด๋ฏธ์ง€ ๋ธ”๋ก ์ •๋ณด ์ถœ๋ ฅ
695
+ if 'image_blocks' in metadata:
696
+ image_blocks = metadata['image_blocks']
697
+ for ib in image_blocks:
698
+ print(f"๐Ÿ–ผ๏ธ [DEBUG] ์ด๋ฏธ์ง€ ๋ธ”๋ก: {ib['block_id']}, "
699
+ f"์œ„์น˜: ({ib['bbox']['x0']:.1f}, {ib['bbox']['y0']:.1f}), "
700
+ f"๊ด€๋ จ ํ…์ŠคํŠธ: {ib['related_text_count']}๊ฐœ")
701
+
702
+ # ์ด๋ฏธ์ง€ ๋ฐ์ดํ„ฐ ์ถ”์ถœ (๊ธฐ์กด ๋ฐฉ์‹ + ์ƒˆ๋กœ์šด ๋ฐฉ์‹ ๋ชจ๋‘ ์ง€์›)
703
+ if metadata.get('multimodal_ready') and 'image_data_list' in metadata:
704
+ image_data_list = metadata['image_data_list']
705
+ if image_data_list and len(image_data_list) > 0:
706
+ extracted_images.extend(image_data_list)
707
+ print(f"๐Ÿ” [DEBUG] ์ฒญํฌ {i+1}์—์„œ ์ด๋ฏธ์ง€ {len(image_data_list)}๊ฐœ ์ถ”์ถœ๋จ")
708
+
709
+ # ์ด๋ฏธ์ง€ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ๋„ ์ถœ๋ ฅ (๊ตฌ์กฐํ™”๋œ ๊ฒฝ์šฐ)
710
+ if 'image_metadata' in metadata:
711
+ for img_meta in metadata['image_metadata'][:2]:
712
+ print(f"๐Ÿ–ผ๏ธ [DEBUG] ์ด๋ฏธ์ง€ ์ƒ์„ธ: {img_meta['block_id']}, "
713
+ f"ํฌ๊ธฐ: {img_meta['size']}, "
714
+ f"๊ด€๋ จ ํ…์ŠคํŠธ: {len(img_meta.get('related_texts', []))}๊ฐœ")
715
 
716
  if len(rag_context) > 30:
717
  context_prompt += rag_context
718
  print(f"๐Ÿ” [DEBUG] RAG ์ปจํ…์ŠคํŠธ ํฌํ•จ๋จ - ๊ธธ์ด: {len(rag_context)}")
719
  print(f"๐Ÿ” [DEBUG] RAG ์ปจํ…์ŠคํŠธ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {rag_context[:100]}...")
720
+
721
+ # ๐Ÿ”„ ์ถ”์ถœ๋œ ์ด๋ฏธ์ง€๊ฐ€ ์žˆ์œผ๋ฉด ๋กœ๊ทธ ์ถœ๋ ฅ
722
+ if extracted_images:
723
+ print(f"๐Ÿ” [DEBUG] ์ด {len(extracted_images)}๊ฐœ ์ด๋ฏธ์ง€ ๋ฐ์ดํ„ฐ ์ถ”์ถœ ์™„๋ฃŒ")
724
+ else:
725
+ print(f"๐Ÿ” [DEBUG] ์ถ”์ถœ๋œ ์ด๋ฏธ์ง€ ์—†์Œ")
726
  else:
727
  print(f"โš ๏ธ [DEBUG] RAG ์ปจํ…์ŠคํŠธ๊ฐ€ ๋„ˆ๋ฌด ์งง์Œ: {len(rag_context)}")
728
  else:
 
835
  # --- 3. ํ† ํฌ๋‚˜์ด์ง• ---
836
  print(f"๐Ÿ” [DEBUG] ํ† ํฌ๋‚˜์ด์ง• ์‹œ์ž‘")
837
  t_tok_start = time.time()
838
+ if not all_image_data or len([img for img in all_image_data if img]) == 0:
839
  # ํ…์ŠคํŠธ-only ๊ณ ์ • ๊ฒฝ๋กœ (๋” ๋น ๋ฆ„)
840
  print(f"๐Ÿ” [DEBUG] ํ…์ŠคํŠธ-only ํ† ํฌ๋‚˜์ด์ง• ๊ฒฝ๋กœ")
841
  print(f"๐Ÿ” [DEBUG] ์‚ฌ์šฉํ•  ํ”„๋กฌํ”„ํŠธ: {formatted_prompt}")
 
858
  # ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ(Lite): Kanana ์ „์šฉ encode_prompt๋กœ -1 ํ† ํฐ ์ž๋ฆฌ ์ƒ์„ฑ (ํ•„์ˆ˜)
859
  print(f"๐Ÿ” [DEBUG] ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ํ† ํฌ๋‚˜์ด์ง• ๊ฒฝ๋กœ")
860
  print(f"๐Ÿ” [DEBUG] combined_image_metas: {combined_image_metas}")
861
+ print(f"๐Ÿ” [DEBUG] ์ด ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜: {len(all_image_data)}")
862
 
863
  if hasattr(tokenizer, 'encode_prompt'):
864
  print(f"๐Ÿ” [DEBUG] encode_prompt ๋ฉ”์„œ๋“œ ์‚ฌ์šฉ")
 
1457
  total_time = time.time() - t_tok_start
1458
  print(f"๐Ÿ” [DEBUG] ์ „์ฒด ์ฒ˜๋ฆฌ ์™„๋ฃŒ - ์ด ์†Œ์š”์‹œ๊ฐ„: {total_time:.3f}์ดˆ")
1459
 
1460
+ # ๐Ÿ”„ ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์™„๋ฃŒ (์ „์—ญ ๋ณ€์ˆ˜ ์ดˆ๊ธฐํ™”๋Š” ์ œ๊ฑฐ๋จ)
1461
+
1462
  return {
1463
  "generated_text": response,
1464
  "processing_time": total_time,
 
1634
  image2: UploadFile = File(None),
1635
  image3: UploadFile = File(None),
1636
  image4: UploadFile = File(None),
1637
+ user_id: str = Form("anonymous"),
1638
+ room_id: str = Form("default"),
1639
  use_context: bool = Form(True),
1640
  session_id: str = Form(None)):
1641
 
 
1644
 
1645
  start_time = time.time()
1646
 
1647
+ # ์„ธ์…˜ ID๊ฐ€ ์—†์œผ๋ฉด ์ž๋™ ์ƒ์„ฑ (์ฑ„ํŒ…๋ฐฉ๋ณ„ ๊ณ ์œ  ์„ธ์…˜)
1648
  if not session_id:
1649
+ # ์ฑ„ํŒ…๋ฐฉ + ์‚ฌ์šฉ์ž + ํƒ€์ž„์Šคํƒฌํ”„ ๊ธฐ๋ฐ˜์œผ๋กœ ๊ณ ์œ ํ•œ ์„ธ์…˜ ์ƒ์„ฑ
1650
+ timestamp = int(time.time())
1651
+ session_id = f"room_{room_id}_user_{user_id}_{timestamp}"
1652
+ print(f"๐Ÿ” [DEBUG] ์ž๋™ ์„ธ์…˜ ID ์ƒ์„ฑ: {session_id} (์ฑ„ํŒ…๋ฐฉ: {room_id}, ์‚ฌ์šฉ์ž: {user_id})")
 
 
 
 
 
 
 
 
1653
 
1654
  if use_context:
1655
  context_manager.add_user_message(prompt, metadata={"session_id": session_id})
 
1667
 
1668
  try:
1669
  # generate_sync ํ•จ์ˆ˜ ํ˜ธ์ถœ (์ปจํ…์ŠคํŠธ ํฌํ•จ)
1670
+ result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id, user_id=user_id, room_id=room_id)
1671
 
1672
  if "error" in result:
1673
  raise HTTPException(status_code=500, detail=result["error"])
 
1817
  async def upload_document(
1818
  file: UploadFile = File(...),
1819
  user_id: str = Form("default_user"), # ๊ธฐ๋ณธ ์‚ฌ์šฉ์ž ID
1820
+ room_id: str = Form("default"), # ์ฑ„ํŒ…๋ฐฉ ID
1821
  document_id: Optional[str] = Form(None) # ๋ฌธ์„œ ID (์ž๋™ ์ƒ์„ฑ ๊ฐ€๋Šฅ)
1822
  ):
1823
  """๋ฌธ์„œ ์—…๋กœ๋“œ ๋ฐ RAG ์ฒ˜๋ฆฌ"""
 
1848
  processing_time = time.time() - start_time
1849
  logger.info(f"๐Ÿ“„ ๋ฌธ์„œ ์—…๋กœ๋“œ ์™„๋ฃŒ ({processing_time:.2f}์ดˆ): {file.filename}")
1850
 
1851
+ # ์ƒˆ๋กœ์šด ๋ฉ”๋ชจ๋ฆฌ ์‹œ์Šคํ…œ์— ๋ฌธ์„œ ์ •๋ณด ์ถ”๊ฐ€
1852
  if result["success"]:
1853
  try:
1854
+ # ๋ฌธ์„œ ์ •๋ณด๋ฅผ ์ฑ„ํŒ…๋ฐฉ ์ปจํ…์ŠคํŠธ์— ์ถ”๊ฐ€
1855
+ chunks = result.get("chunks", [])
1856
+ chunk_count = len(chunks) if isinstance(chunks, list) else 0
1857
 
1858
+ document_info = {
1859
+ "document_id": document_id,
1860
+ "filename": file.filename,
1861
+ "uploaded_by": user_id,
1862
+ "document_type": file.filename.split('.')[-1].lower() if '.' in file.filename else "unknown",
1863
+ "page_count": result.get("page_count", 0),
1864
+ "chunk_count": chunk_count,
1865
+ "summary": result.get("message", "")
1866
+ }
1867
+
1868
+ # ํ†ตํ•ฉ ๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ์ž์— ๋ฌธ์„œ ์ถ”๊ฐ€
1869
+ integrated_memory_manager.add_document_to_room(room_id, document_info)
1870
+
1871
+ # ์‚ฌ์šฉ์ž ํ†ต๊ณ„ ์—…๋ฐ์ดํŠธ
1872
+ integrated_memory_manager.record_conversation(
1873
+ user_id, room_id,
1874
+ topic=f"๋ฌธ์„œ ์—…๋กœ๋“œ: {file.filename}"
1875
+ )
1876
+
1877
+ logger.info(f"โœ… ๋ฉ”๋ชจ๋ฆฌ ์‹œ์Šคํ…œ์— ๋ฌธ์„œ ์ •๋ณด ์ถ”๊ฐ€ ์™„๋ฃŒ: {room_id} - {file.filename}")
1878
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1879
  except Exception as e:
1880
+ logger.warning(f"โš ๏ธ ๋ฉ”๋ชจ๋ฆฌ ์‹œ์Šคํ…œ ์—…๋ฐ์ดํŠธ ์‹คํŒจ: {e}")
1881
+
1882
+ # ๋ฌธ์„œ ์—…๋กœ๋“œ ํ›„ ์ž๋™ AI ์‘๋‹ต ์ƒ์„ฑ ๋น„ํ™œ์„ฑํ™” (AI ๋ฆฌ์†Œ์Šค ์ ˆ์•ฝ)
1883
+ # ์‚ฌ์šฉ์ž๊ฐ€ ์ง์ ‘ ์งˆ๋ฌธํ•  ๋•Œ๋งŒ AI ์‘๋‹ต ์ƒ์„ฑ
1884
+ auto_generate_response = False
1885
+
1886
+ if result["success"]:
1887
+ # ์ž๋™ AI ์š”์•ฝ ์—†์ด ๋ฌธ์„œ ์—…๋กœ๋“œ๋งŒ ์™„๋ฃŒ
1888
+ result["auto_response"] = f"๋ฌธ์„œ '{file.filename}' ์—…๋กœ๋“œ ์™„๋ฃŒ! ์ด์ œ ์งˆ๋ฌธํ•ด์ฃผ์„ธ์š”."
1889
+ logger.info(f"๐Ÿ“„ ์ž๋™ AI ์‘๋‹ต ์ƒ์„ฑ ๊ฑด๋„ˆ๋›ฐ๊ธฐ - AI ๋ฆฌ์†Œ์Šค ์ ˆ์•ฝ (์‚ฌ์šฉ์ž ์งˆ๋ฌธ ์‹œ์—๋งŒ AI ์‘๋‹ต)")
1890
+ else:
1891
+ result["auto_response"] = "๋ฌธ์„œ ์—…๋กœ๋“œ์— ์‹คํŒจํ–ˆ์Šต๋‹ˆ๋‹ค."
1892
 
1893
  return DocumentUploadResponse(
1894
  success=result["success"],
lily_llm_core/document_processor.py CHANGED
@@ -2,41 +2,34 @@
2
  """
3
  문서 처리 모듈
4
  PDF, DOCX, PPTX 등 다양한 문서 형식을 처리
 
5
  """
6
 
7
  import os
8
  import logging
9
- from typing import List, Dict, Any, Optional
10
  from pathlib import Path
11
  import easyocr
12
  import re
13
  import base64
14
  import io
 
 
15
 
16
  # LangChain 문서 로더들
17
  try:
18
  from langchain_community.document_loaders import (
19
  PyMuPDFLoader,
20
  UnstructuredWordDocumentLoader,
21
- UnstructuredPowerPointLoader,
22
- UnstructuredFileLoader
23
  )
24
- except ImportError as e:
25
- logger.error(f"โŒ LangChain ๋ฌธ์„œ ๋กœ๋” import ์‹คํŒจ: {e}")
26
- logger.error("pymupdf ํŒจํ‚ค์ง€๋ฅผ ์„ค์น˜ํ•ด์ฃผ์„ธ์š”: pip install pymupdf")
27
- raise
28
- from langchain.text_splitter import RecursiveCharacterTextSplitter
29
- from langchain.schema import Document
30
-
31
- # OCR imports
32
- try:
33
- import easyocr
34
- EASYOCR_AVAILABLE = True
35
  except ImportError:
36
- EASYOCR_AVAILABLE = False
37
- easyocr = None
38
 
39
- # Image processing imports
40
  try:
41
  from PIL import Image, ImageEnhance
42
  PIL_AVAILABLE = True
@@ -47,48 +40,149 @@ except ImportError:
47
 
48
  logger = logging.getLogger(__name__)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  class DocumentProcessor:
51
  """๋ฌธ์„œ ์ฒ˜๋ฆฌ ํด๋ž˜์Šค"""
52
 
53
  def __init__(self, formula_ocr_engine: str = 'easyocr'):
54
  """
 
 
55
  Args:
56
  formula_ocr_engine: ์ˆ˜์‹ ์ถ”์ถœ ์—”์ง„ ('easyocr', 'mathpix', 'latexocr')
57
  """
58
  self.formula_ocr_engine = formula_ocr_engine
59
- self.supported_formats = {
60
- '.pdf': 'pdf',
61
- '.docx': 'docx',
62
- '.doc': 'doc',
63
- '.pptx': 'pptx',
64
- '.ppt': 'ppt',
65
- '.txt': 'text'
66
- }
67
 
68
- # ํ…์ŠคํŠธ ๋ถ„ํ• ๊ธฐ ์„ค์ •
69
- self.text_splitter = RecursiveCharacterTextSplitter(
70
- chunk_size=1000,
71
- chunk_overlap=200,
72
- length_function=len,
73
- separators=["\n\n", "\n", " ", ""]
74
- )
75
 
76
- # OCR ๋ฆฌ๋” ์ดˆ๊ธฐํ™” (ํ•œ๊ตญ์–ด + ์˜์–ด + ์ˆ˜ํ•™ ๊ธฐํ˜ธ ์ตœ์ ํ™”)
77
- if not EASYOCR_AVAILABLE:
78
- logger.warning("โš ๏ธ EasyOCR์ด ์„ค์น˜๋˜์ง€ ์•Š์•„ OCR ๊ธฐ๋Šฅ์„ ์‚ฌ์šฉํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
 
 
 
79
  self.ocr_reader = None
80
- else:
81
- try:
82
- # ๋งค์šฐ ๊ธฐ๋ณธ์ ์ธ ์„ค์ •์œผ๋กœ ๋‹จ์ˆœํ™”
83
- self.ocr_reader = easyocr.Reader(
84
- ['ko', 'en'],
85
- gpu=False,
86
- verbose=True, # ๋””๋ฒ„๊น…์„ ์œ„ํ•ด verbose ํ™œ์„ฑํ™”
87
- )
88
- logger.info("โœ… OCR ๋ฆฌ๋” ์ดˆ๊ธฐํ™” ์™„๋ฃŒ (๊ธฐ๋ณธ ์„ค์ •)")
89
- except Exception as e:
90
- logger.error(f"โŒ OCR ๋ฆฌ๋” ์ดˆ๊ธฐํ™” ์‹คํŒจ: {e}")
91
- self.ocr_reader = None
92
 
93
  # ์ˆ˜์‹ ์ถ”์ถœ ์—”์ง„ ์„ค์ • (LaTeX-OCR ๋น„ํ™œ์„ฑํ™”๋จ)
94
  if formula_ocr_engine in ['mathpix']: # 'latexocr' ์ œ๊ฑฐ
@@ -101,345 +195,57 @@ class DocumentProcessor:
101
  logger.warning(f"โš ๏ธ ์ˆ˜์‹ ์ถ”์ถœ ์—”์ง„ {formula_ocr_engine} ์‚ฌ์šฉ ๋ถˆ๊ฐ€, EasyOCR๋กœ ๋Œ€์ฒด")
102
  else:
103
  self.formula_extractor_available = False
104
-
105
- def get_file_type(self, file_path: str) -> Optional[str]:
106
- """ํŒŒ์ผ ํƒ€์ž… ํ™•์ธ"""
107
- try:
108
- # ํŒŒ์ผ ๊ฒฝ๋กœ์—์„œ ํ™•์žฅ์ž ์ถ”์ถœ
109
- file_path_str = str(file_path)
110
- # Path ๊ฐ์ฒด๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ํ™•์žฅ์ž ์ถ”์ถœ
111
- path_obj = Path(file_path_str)
112
- extension = path_obj.suffix.lower()
113
-
114
- if extension:
115
- # ๐Ÿ”„ ํ™•์žฅ์ž ์•ž์˜ ์ (.) ์ œ๊ฑฐํ•˜์—ฌ ๋ฐ˜ํ™˜
116
- file_type = extension[1:] if extension.startswith('.') else extension
117
- logger.info(f"๐Ÿ“„ ํŒŒ์ผ ํ™•์žฅ์ž ์ธ์‹: {extension} -> {file_type}")
118
- return file_type
119
- else:
120
- logger.warning(f"โš ๏ธ ํŒŒ์ผ ํ™•์žฅ์ž๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {file_path}")
121
- return None
122
- except Exception as e:
123
- logger.error(f"โŒ ํŒŒ์ผ ํƒ€์ž… ํ™•์ธ ์‹คํŒจ: {e}")
124
- return None
125
-
126
- def extract_text_from_image(self, image_data: bytes) -> str:
127
- """์ด๋ฏธ์ง€์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ (OCR) - ์ˆ˜ํ•™ ๊ธฐํ˜ธ ์ตœ์ ํ™”"""
128
- if not self.ocr_reader:
129
- logger.warning("โš ๏ธ OCR ๋ฆฌ๋”๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
130
- return ""
131
 
132
- try:
133
- import numpy as np
134
- import io
135
-
136
- # ์ด๋ฏธ์ง€ ๋ฐ์ดํ„ฐ๋ฅผ PIL Image๋กœ ๋ณ€ํ™˜
137
- pil_image = Image.open(io.BytesIO(image_data))
138
- logger.info(f"๐Ÿ“ธ ์ด๋ฏธ์ง€ ํฌ๊ธฐ: {pil_image.size}")
139
-
140
- # ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ (์ˆ˜ํ•™ ๊ธฐํ˜ธ ์ธ์‹ ๊ฐœ์„ )
141
- if PIL_AVAILABLE:
142
- pil_image = self.preprocess_image_for_math(pil_image)
143
- logger.info(f"๐Ÿ“ธ ์ „์ฒ˜๋ฆฌ ํ›„ ์ด๋ฏธ์ง€ ํฌ๊ธฐ: {pil_image.size}")
144
-
145
- # PIL Image๋ฅผ numpy ๋ฐฐ์—ด๋กœ ๋ณ€ํ™˜
146
- img_array = np.array(pil_image)
147
- logger.info(f"๐Ÿ“ธ numpy ๋ฐฐ์—ด ํ˜•ํƒœ: {img_array.shape}")
148
-
149
- # OCR ์‹คํ–‰ (๊ธฐ๋ณธ ์„ค์ •์œผ๋กœ ๋‹จ์ˆœํ™”)
150
- logger.info("๐Ÿ” OCR ์‹คํ–‰ ์‹œ์ž‘...")
151
- results = self.ocr_reader.readtext(
152
- img_array,
153
- paragraph=True, # ๋‹จ๋ฝ ๋‹จ์œ„๋กœ ์ฒ˜๋ฆฌ
154
- )
155
- logger.info(f"๐Ÿ” OCR ๊ฒฐ๊ณผ ๊ฐœ์ˆ˜: {len(results)}")
156
-
157
- # ์ถ”์ถœ๋œ ํ…์ŠคํŠธ ๊ฒฐํ•ฉ (์‹ ๋ขฐ๋„ ๊ธฐ๋ฐ˜ ํ•„ํ„ฐ๋ง)
158
- extracted_text = ""
159
- for i, result in enumerate(results):
160
- try:
161
- # ๊ฒฐ๊ณผ ํ˜•์‹ ํ™•์ธ ๋ฐ ์•ˆ์ „ํ•œ ์ฒ˜๋ฆฌ
162
- if len(result) == 3:
163
- bbox, text, confidence = result
164
- elif len(result) == 2:
165
- bbox, text = result
166
- confidence = 0.5 # ๊ธฐ๋ณธ ์‹ ๋ขฐ๋„
167
- else:
168
- logger.warning(f"โš ๏ธ ์˜ˆ์ƒ์น˜ ๋ชปํ•œ OCR ๊ฒฐ๊ณผ ํ˜•์‹: {result}")
169
- continue
170
-
171
- logger.info(f"๐Ÿ” ๊ฒฐ๊ณผ {i+1}: '{text}' (์‹ ๋ขฐ๋„: {confidence:.2f})")
172
-
173
- if confidence > 0.3: # ์‹ ๋ขฐ๋„ ์ž„๊ณ„๊ฐ’์„ 30%๋กœ ๋‚ฎ์ถค
174
- # ์ˆ˜ํ•™ ๊ธฐํ˜ธ ํ›„์ฒ˜๋ฆฌ
175
- processed_text = self.post_process_math_symbols(text)
176
- extracted_text += processed_text + " "
177
- else:
178
- logger.info(f"โš ๏ธ ์‹ ๋ขฐ๋„ ๋‚ฎ์Œ ์ œ์™ธ: '{text}' (์‹ ๋ขฐ๋„: {confidence:.2f})")
179
-
180
- except Exception as e:
181
- logger.warning(f"โš ๏ธ OCR ๊ฒฐ๊ณผ ์ฒ˜๋ฆฌ ์‹คํŒจ (๊ฒฐ๊ณผ {i+1}): {e}")
182
- continue
183
-
184
- # LaTeX ์ˆ˜์‹ ํŒจํ„ด ๊ฐ์ง€ ๋ฐ ์ •๋ฆฌ
185
- latex_patterns = [
186
- r'\\[a-zA-Z]+', # LaTeX ๋ช…๋ น์–ด
187
- r'\\[a-zA-Z]+\{[^}]*\}', # LaTeX ๋ช…๋ น์–ด + ์ธ์ˆ˜
188
- r'\$[^$]+\$', # ์ธ๋ผ์ธ ์ˆ˜์‹
189
- r'\$\$[^$]+\$\$', # ๋ธ”๋ก ์ˆ˜์‹
190
- r'\\begin\{[^}]*\}.*?\\end\{[^}]*\}', # ํ™˜๊ฒฝ
191
- ]
192
-
193
- latex_text = ""
194
- for pattern in latex_patterns:
195
- matches = re.findall(pattern, extracted_text)
196
- if matches:
197
- latex_text += " ".join(matches) + "\n"
198
-
199
- # ์ตœ์ข… ํ…์ŠคํŠธ ๊ตฌ์„ฑ
200
- final_text = extracted_text.strip()
201
- if latex_text.strip():
202
- final_text += f"\n\n[LaTeX ์ˆ˜์‹ ๊ฐ์ง€]\n{latex_text.strip()}"
203
-
204
- logger.info(f"โœ… OCR ํ…์ŠคํŠธ ์ถ”์ถœ ์™„๋ฃŒ: {len(final_text)}์ž")
205
- if len(final_text) == 0:
206
- logger.warning("โš ๏ธ OCR์—์„œ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.")
207
-
208
- return final_text
209
-
210
- except Exception as e:
211
- logger.error(f"โŒ OCR ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ: {e}")
212
- return ""
213
 
214
- def preprocess_image_for_math(self, image: Image.Image) -> Image.Image:
215
- """์ˆ˜ํ•™ ๊ธฐํ˜ธ ์ธ์‹์„ ์œ„ํ•œ ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ (๋‹จ์ˆœํ™”)"""
216
- if not PIL_AVAILABLE:
217
- logger.warning("โš ๏ธ PIL์ด ์„ค์น˜๋˜์ง€ ์•Š์•„ ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ๋ฅผ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.")
218
- return image
219
-
220
- try:
221
- # ๊ธฐ๋ณธ์ ์ธ ํฌ๊ธฐ ์กฐ์ •๋งŒ ์ˆ˜ํ–‰
222
- width, height = image.size
223
- logger.debug(f"๐Ÿ“ธ ์›๋ณธ ์ด๋ฏธ์ง€ ํฌ๊ธฐ: {width}x{height}")
224
-
225
- # ๋„ˆ๋ฌด ์ž‘์€ ์ด๋ฏธ์ง€๋งŒ ํ™•๋Œ€ (์•ˆ์ •์„ฑ ์šฐ์„ )
226
- if width < 1000 or height < 1000:
227
- scale_factor = max(1000 / width, 1000 / height)
228
- new_width = int(width * scale_factor)
229
- new_height = int(height * scale_factor)
230
- image = image.resize((new_width, new_height), Image.LANCZOS)
231
- logger.debug(f"๐Ÿ“ธ ํ™•๋Œ€ ํ›„ ์ด๋ฏธ์ง€ ํฌ๊ธฐ: {new_width}x{new_height}")
232
-
233
- # ๊ธฐ๋ณธ์ ์ธ ๋Œ€๋น„ ๊ฐœ์„ ๋งŒ ์ˆ˜ํ–‰
234
- try:
235
- enhancer = ImageEnhance.Contrast(image)
236
- image = enhancer.enhance(1.1) # ๋Œ€๋น„ 10% ์ฆ๊ฐ€ (๋‚ฎ์ถค)
237
- logger.debug("๐Ÿ“ธ ๋Œ€๋น„ ๊ฐœ์„  ์™„๋ฃŒ")
238
- except Exception as e:
239
- logger.debug(f"โš ๏ธ ๋Œ€๋น„ ๊ฐœ์„  ์‹คํŒจ: {e}")
240
-
241
- return image
242
-
243
- except Exception as e:
244
- logger.warning(f"โš ๏ธ ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
245
- return image
246
-
247
- def post_process_math_symbols(self, text: str) -> str:
248
- """์ˆ˜ํ•™ ๊ธฐํ˜ธ ํ›„์ฒ˜๋ฆฌ"""
249
- if not text:
250
- return text
251
-
252
- # ์ˆ˜ํ•™ ๊ธฐํ˜ธ ๋งคํ•‘ (OCR ์˜ค์ธ์‹ ๊ฐœ์„ )
253
- math_symbols = {
254
- '@': 'ร—', # ๊ณฑํ•˜๊ธฐ ๊ธฐํ˜ธ
255
- '4': '=', # ๋“ฑํ˜ธ
256
- '๊ตฌ': 'f', # ํ•จ์ˆ˜ f
257
- 'B': 'ฮฒ', # ๋ฒ ํƒ€
258
- 'A': 'ฮฑ', # ์•ŒํŒŒ
259
- 'C': 'ฮณ', # ๊ฐ๋งˆ
260
- 'D': 'ฮด', # ๋ธํƒ€
261
- 'E': 'ฮต', # ์—ก์‹ค๋ก 
262
- 'F': 'ฯ†', # ํŒŒ์ด
263
- 'G': 'ฮณ', # ๊ฐ๋งˆ
264
- 'H': 'ฮท', # ์—ํƒ€
265
- 'I': 'ฮน', # ์ด์˜คํƒ€
266
- 'K': 'ฮบ', # ์นดํŒŒ
267
- 'L': 'ฮป', # ๋žŒ๋‹ค
268
- 'M': 'ฮผ', # ๋ฎค
269
- 'N': 'ฮฝ', # ๋‰ด
270
- 'O': 'ฮฟ', # ์˜ค๋ฏธํฌ๋ก 
271
- 'P': 'ฯ€', # ํŒŒ์ด
272
- 'Q': 'ฮธ', # ์„ธํƒ€
273
- 'R': 'ฯ', # ๋กœ
274
- 'S': 'ฯƒ', # ์‹œ๊ทธ๋งˆ
275
- 'T': 'ฯ„', # ํƒ€์šฐ
276
- 'U': 'ฯ…', # ์›์‹ค๋ก 
277
- 'V': 'ฯ‰', # ์˜ค๋ฉ”๊ฐ€
278
- 'W': 'ฯˆ', # ํ”„์‚ฌ์ด
279
- 'X': 'ฯ‡', # ์นด์ด
280
- 'Y': 'ฯ…', # ์›์‹ค๋ก 
281
- 'Z': 'ฮถ', # ์ œํƒ€
282
- }
283
-
284
- # ์ˆ˜ํ•™ ๊ธฐํ˜ธ ๊ต์ฒด
285
- for wrong, correct in math_symbols.items():
286
- text = text.replace(wrong, correct)
287
-
288
- return text
289
-
290
- def post_process_ocr_text(self, text: str) -> str:
291
- """OCR ๊ฒฐ๊ณผ ํ›„์ฒ˜๋ฆฌ - ์ˆ˜ํ•™ ๊ธฐํ˜ธ ๊ฐœ์„ """
292
- if not text:
293
- return text
294
-
295
- # ์ˆ˜ํ•™ ๊ธฐํ˜ธ ๋งคํ•‘ (OCR ์˜ค์ธ์‹ ๊ฐœ์„ )
296
- math_symbols = {
297
- '@': 'ร—', # ๊ณฑํ•˜๊ธฐ ๊ธฐํ˜ธ
298
- '4': '=', # ๋“ฑํ˜ธ
299
- '๊ตฌ': 'f', # ํ•จ์ˆ˜ f
300
- 'B': 'ฮฒ', # ๋ฒ ํƒ€
301
- 'A': 'ฮฑ', # ์•ŒํŒŒ
302
- 'C': 'ฮณ', # ๊ฐ๋งˆ
303
- 'D': 'ฮด', # ๋ธํƒ€
304
- 'E': 'ฮต', # ์—ก์‹ค๋ก 
305
- 'F': 'ฯ†', # ํŒŒ์ด
306
- 'G': 'ฮณ', # ๊ฐ๋งˆ
307
- 'H': 'ฮท', # ์—ํƒ€
308
- 'I': 'ฮน', # ์ด์˜คํƒ€
309
- 'K': 'ฮบ', # ์นดํŒŒ
310
- 'L': 'ฮป', # ๋žŒ๋‹ค
311
- 'M': 'ฮผ', # ๋ฎค
312
- 'N': 'ฮฝ', # ๋‰ด
313
- 'O': 'ฮฟ', # ์˜ค๋ฏธํฌ๋ก 
314
- 'P': 'ฯ€', # ํŒŒ์ด
315
- 'Q': 'ฮธ', # ์„ธํƒ€
316
- 'R': 'ฯ', # ๋กœ
317
- 'S': 'ฯƒ', # ์‹œ๊ทธ๋งˆ
318
- 'T': 'ฯ„', # ํƒ€์šฐ
319
- 'U': 'ฯ…', # ์›์‹ค๋ก 
320
- 'V': 'ฯ‰', # ์˜ค๋ฉ”๊ฐ€
321
- 'W': 'ฯˆ', # ํ”„์‚ฌ์ด
322
- 'X': 'ฯ‡', # ์นด์ด
323
- 'Y': 'ฯ…', # ์›์‹ค๋ก 
324
- 'Z': 'ฮถ', # ์ œํƒ€
325
- '0': 'ฮธ', # ์„ธํƒ€ (์ˆซ์ž 0๊ณผ ํ˜ผ๋™)
326
- '1': 'ฮน', # ์ด์˜คํƒ€ (์ˆซ์ž 1๊ณผ ํ˜ผ๋™)
327
- '2': 'ฮถ', # ์ œํƒ€ (์ˆซ์ž 2์™€ ํ˜ผ๋™)
328
- '3': 'ฮพ', # ํฌ์‹œ (์ˆซ์ž 3๊ณผ ํ˜ผ๋™)
329
- '5': 'ฯ‚', # ์‹œ๊ทธ๋งˆ (์ˆซ์ž 5์™€ ํ˜ผ๋™)
330
- '6': 'ฯ‚', # ์‹œ๊ทธ๋งˆ (์ˆซ์ž 6๊ณผ ํ˜ผ๋™)
331
- '7': 'ฮท', # ์—ํƒ€ (์ˆซ์ž 7๊ณผ ํ˜ผ๋™)
332
- '8': 'ฮธ', # ์„ธํƒ€ (์ˆซ์ž 8๊ณผ ํ˜ผ๋™)
333
- '9': 'ฮถ', # ์ œํƒ€ (์ˆซ์ž 9์™€ ํ˜ผ๋™)
334
- }
335
-
336
- # ์ˆ˜ํ•™ ๊ธฐํ˜ธ ๊ต์ฒด
337
- for wrong, correct in math_symbols.items():
338
- text = text.replace(wrong, correct)
339
-
340
- # LaTeX ์ˆ˜์‹ ํŒจํ„ด ๊ฐ์ง€ ๋ฐ ๊ฐœ์„ 
341
- latex_patterns = [
342
- (r'f\s*\(\s*([^)]+)\s*\)', r'f(\1)'), # ํ•จ์ˆ˜ ํ‘œ๊ธฐ ์ •๋ฆฌ
343
- (r'lim\s*([^โ†’]+)โ†’([^=]+)=', r'\\lim_{\1 \\to \2} ='), # ๊ทนํ•œ ํ‘œ๊ธฐ
344
- (r'โˆซ\s*([^d]+)d([^=]+)', r'\\int \1 d\2'), # ์ ๋ถ„ ํ‘œ๊ธฐ
345
- (r'โˆ‘\s*([^=]+)=', r'\\sum \1 ='), # ํ•ฉ ํ‘œ๊ธฐ
346
- (r'ฯ€', r'\\pi'), # ํŒŒ์ด
347
- (r'โˆž', r'\\infty'), # ๋ฌดํ•œ๋Œ€
348
- (r'โˆš([^=]+)', r'\\sqrt{\1}'), # ์ œ๊ณฑ๊ทผ
349
- (r'([0-9]+)\^([0-9]+)', r'\1^{\\2}'), # ์ง€์ˆ˜
350
- (r'([0-9]+)/([0-9]+)', r'\\frac{\1}{\2}'), # ๋ถ„์ˆ˜
351
- ]
352
-
353
- for pattern, replacement in latex_patterns:
354
- text = re.sub(pattern, replacement, text)
355
-
356
- return text
357
 
358
  def load_document(self, file_path: str) -> List[Document]:
359
- """๋ฌธ์„œ ๋กœ๋“œ ๋ฐ ํ…์ŠคํŠธ ์ถ”์ถœ"""
360
- file_type = self.get_file_type(file_path)
361
- if not file_type:
362
- raise ValueError(f"์ง€์›ํ•˜์ง€ ์•Š๋Š” ํŒŒ์ผ ํ˜•์‹: {file_path}")
363
-
364
- documents = []
365
-
366
  try:
 
 
367
  if file_type == 'pdf':
368
- # PDF ์ฒ˜๋ฆฌ: ํ…์ŠคํŠธ ์ง์ ‘ ์ถ”์ถœ ๋ฐฉ์‹์œผ๋กœ ๋ณ€๊ฒฝ
369
- import fitz # PyMuPDF
370
-
371
- doc = fitz.open(file_path)
372
- logger.info(f"๐Ÿ“„ PDF ๋ฌธ์„œ ๋กœ๋“œ: {len(doc)} ํŽ˜์ด์ง€")
373
-
374
- for page_num in range(len(doc)):
375
- page = doc.load_page(page_num)
376
-
377
- # ํ…์ŠคํŠธ ์ง์ ‘ ์ถ”์ถœ (OCR ๋Œ€์‹ )
378
- page_text = page.get_text()
379
-
380
- # ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ๊ฑฐ๋‚˜ ๋„ˆ๋ฌด ์งง์€ ๊ฒฝ์šฐ์—๋งŒ OCR ์‚ฌ์šฉ
381
- if not page_text.strip() or len(page_text.strip()) < 50:
382
- logger.info(f"โš ๏ธ ํŽ˜์ด์ง€ {page_num + 1} ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ, OCR ์‚ฌ์šฉ")
383
- pix = page.get_pixmap(dpi=300)
384
- img_data = pix.tobytes("png")
385
- page_text = self.extract_text_from_image(img_data)
386
- page_text = self.post_process_ocr_text(page_text)
387
-
388
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์„ค์ •
389
- metadata = {
390
- 'source': file_path,
391
- 'page': page_num + 1,
392
- 'file_type': 'pdf',
393
- 'processing_method': 'text_extraction' if page_text.strip() else 'ocr'
394
- }
395
-
396
- # Document ๊ฐ์ฒด ์ƒ์„ฑ
397
- documents.append(Document(
398
- page_content=page_text,
399
- metadata=metadata
400
- ))
401
-
402
- logger.info(f"โœ… ํŽ˜์ด์ง€ {page_num + 1} ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {len(page_text)} ๋ฌธ์ž")
403
-
404
- doc.close()
405
- logger.info(f"โœ… PDF ๋ฌธ์„œ ๋กœ๋“œ ์™„๋ฃŒ: {len(documents)}๊ฐœ ํŽ˜์ด์ง€")
406
- return documents
407
-
408
  elif file_type == 'docx':
409
  loader = UnstructuredWordDocumentLoader(file_path)
410
- documents = loader.load()
411
- logger.info(f"โœ… DOCX ๋ฌธ์„œ ๋กœ๋“œ ์™„๋ฃŒ: {len(documents)}๊ฐœ ์ฒญํฌ")
412
-
413
  elif file_type == 'pptx':
414
  loader = UnstructuredPowerPointLoader(file_path)
415
- documents = loader.load()
416
- logger.info(f"โœ… PPTX ๋ฌธ์„œ ๋กœ๋“œ ์™„๋ฃŒ: {len(documents)}๊ฐœ ์ฒญํฌ")
417
-
418
- elif file_type == 'text':
419
- loader = UnstructuredFileLoader(file_path)
420
- documents = loader.load()
421
- logger.info(f"โœ… ํ…์ŠคํŠธ ๋ฌธ์„œ ๋กœ๋“œ ์™„๋ฃŒ: {len(documents)}๊ฐœ ์ฒญํฌ")
422
-
423
  else:
424
- raise ValueError(f"์ง€์›ํ•˜์ง€ ์•Š๋Š” ํŒŒ์ผ ํ˜•์‹: {file_type}")
425
-
 
 
 
 
 
426
  except Exception as e:
427
  logger.error(f"โŒ ๋ฌธ์„œ ๋กœ๋“œ ์‹คํŒจ: {e}")
428
- raise
429
-
430
- return documents
431
 
432
- def split_documents(self, documents: List[Document]) -> List[Document]:
433
  """๋ฌธ์„œ๋ฅผ ์ฒญํฌ๋กœ ๋ถ„ํ• """
434
- logger.info(f"๐Ÿ“ ๋ฌธ์„œ ๋ถ„ํ•  ์ค‘: {len(documents)}๊ฐœ ๋ฌธ์„œ")
435
-
436
  try:
437
- split_docs = self.text_splitter.split_documents(documents)
438
- logger.info(f"โœ… ๋ฌธ์„œ ๋ถ„ํ•  ์™„๋ฃŒ: {len(split_docs)}๊ฐœ ์ฒญํฌ")
 
 
 
 
 
 
 
 
 
439
  return split_docs
 
440
  except Exception as e:
441
  logger.error(f"โŒ ๋ฌธ์„œ ๋ถ„ํ•  ์‹คํŒจ: {e}")
442
- raise
443
 
444
  def process_document(self, file_path: str) -> List[Document]:
445
  """
@@ -457,8 +263,12 @@ class DocumentProcessor:
457
  file_type = self.get_file_type(file_path)
458
 
459
  if file_type == 'pdf':
460
- # PDF๋Š” ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๋ฐฉ์‹์œผ๋กœ ์ฒ˜๋ฆฌ
461
- return self._process_pdf_hybrid(file_path)
 
 
 
 
462
  else:
463
  # ๋‹ค๋ฅธ ๋ฌธ์„œ ํ˜•์‹์€ ํ…์ŠคํŠธ ๊ธฐ๋ฐ˜ ์ฒ˜๋ฆฌ
464
  documents = self.load_document(file_path)
@@ -488,156 +298,569 @@ class DocumentProcessor:
488
  except Exception as e:
489
  logger.error(f"โŒ ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
490
  return []
491
-
492
  def _process_pdf_hybrid(self, pdf_path: str) -> List[Document]:
493
  """
494
- PDF ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ์ฒ˜๋ฆฌ (ํ…์ŠคํŠธ + ์ด๋ฏธ์ง€)
495
- Cursor AI ๋ฐฉ์‹: ํ…์ŠคํŠธ ๊ธฐ๋ฐ˜ + ์ด๋ฏธ์ง€ ํฌํ•จ ํŽ˜์ด์ง€๋Š” ์ด๋ฏธ์ง€๋„ ํ•จ๊ป˜ ์ฒ˜๋ฆฌ
496
-
497
- Args:
498
- pdf_path: PDF ํŒŒ์ผ ๊ฒฝ๋กœ
499
-
500
- Returns:
501
- List[Document]: ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ์ฒ˜๋ฆฌ๋œ ๋ฌธ์„œ ์ฒญํฌ๋“ค
502
  """
503
  try:
504
  import fitz # PyMuPDF
505
 
506
  doc = fitz.open(pdf_path)
507
- processed_docs = []
 
 
 
508
 
509
- logger.info(f"๐Ÿ“„ PDF ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ์ฒ˜๋ฆฌ ์‹œ์ž‘: {len(doc)}ํŽ˜์ด์ง€")
510
 
511
  for page_num in range(len(doc)):
512
- page = doc.load_page(page_num)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
 
514
- # 1. ํ…์ŠคํŠธ ์ถ”์ถœ ์‹œ๋„
515
- text_content = page.get_text()
 
 
 
 
 
 
 
 
 
 
 
 
516
 
517
- # 2. ์ด๋ฏธ์ง€ ์ถ”์ถœ
518
- image_list = page.get_images()
519
- page_images = []
520
 
521
- for img_index, img in enumerate(image_list):
522
- try:
523
- xref = img[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  pix = fitz.Pixmap(doc, xref)
525
-
526
- if pix.n - pix.alpha < 4: # CMYK ์ฒ˜๋ฆฌ
 
 
 
 
 
527
  pix = fitz.Pixmap(fitz.csRGB, pix)
 
 
 
 
 
 
 
528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
  img_data = pix.tobytes("png")
530
- img_pil = Image.open(io.BytesIO(img_data))
531
 
532
- if self._is_valid_image(img_pil):
533
- # ์ด๋ฏธ์ง€๋ฅผ Base64๋กœ ์ธ์ฝ”๋”ฉ
534
- img_buffer = io.BytesIO()
535
- img_pil.save(img_buffer, format='PNG')
536
- img_base64 = base64.b64encode(img_buffer.getvalue()).decode()
537
- image_url = f"data:image/png;base64,{img_base64}"
538
-
539
- page_images.append({
540
- "index": img_index,
541
- "image_url": image_url,
542
- "size": img_pil.size
543
- })
544
 
545
- pix = None
 
 
 
 
 
 
 
 
 
 
 
 
 
546
 
547
- except Exception as e:
548
- logger.warning(f"โš ๏ธ ์ด๋ฏธ์ง€ ์ถ”์ถœ ์‹คํŒจ (ํŽ˜์ด์ง€ {page_num + 1}, ์ธ๋ฑ์Šค {img_index}): {e}")
549
- continue
550
 
551
- # 3. ํŽ˜์ด์ง€๋ณ„ ๋ฌธ์„œ ์ƒ์„ฑ
552
- if text_content.strip() or page_images:
553
- # ํ…์ŠคํŠธ๊ฐ€ ์žˆ๊ฑฐ๋‚˜ ์ด๋ฏธ์ง€๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ
554
- content_parts = []
555
-
556
- if text_content.strip():
557
- content_parts.append(f"[ํŽ˜์ด์ง€ {page_num + 1}] {text_content.strip()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
 
559
- if page_images:
560
- content_parts.append(f"[์ด๋ฏธ์ง€ {len(page_images)}๊ฐœ ํฌํ•จ]")
 
561
 
562
- page_content = "\n\n".join(content_parts)
 
 
 
563
 
564
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ตฌ์„ฑ
565
- metadata = {
566
- "source": pdf_path,
567
- "page": page_num + 1,
568
- "total_pages": len(doc),
569
- "has_text": bool(text_content.strip()),
570
- "has_images": bool(page_images),
571
- "image_count": len(page_images)
572
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
 
574
- # ์ด๋ฏธ์ง€๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ์ด๋ฏธ์ง€ URL๋“ค ์ถ”๊ฐ€
575
- if page_images:
576
- metadata["image_urls"] = [img["image_url"] for img in page_images]
577
- metadata["image_sizes"] = [img["size"] for img in page_images]
 
 
 
 
 
 
 
 
578
 
579
- doc_chunk = Document(
580
- page_content=page_content,
581
- metadata=metadata
582
- )
583
- processed_docs.append(doc_chunk)
 
584
 
585
- logger.info(f"โœ… ํŽ˜์ด์ง€ {page_num + 1} ์ฒ˜๋ฆฌ ์™„๋ฃŒ: ํ…์ŠคํŠธ={bool(text_content.strip())}, ์ด๋ฏธ์ง€={len(page_images)}๊ฐœ")
586
- else:
587
- # ๋นˆ ํŽ˜์ด์ง€๋Š” ๊ฑด๋„ˆ๋›ฐ๊ธฐ
588
- logger.info(f"โญ๏ธ ํŽ˜์ด์ง€ {page_num + 1} ๊ฑด๋„ˆ๋›ฐ๊ธฐ (๋นˆ ํŽ˜์ด์ง€)")
589
-
590
- doc.close()
591
-
592
- if not processed_docs:
593
- logger.warning("โš ๏ธ ์ฒ˜๋ฆฌ๋œ ๋ฌธ์„œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ํ”Œ๋ ˆ์ด์Šคํ™€๋”๋ฅผ ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.")
594
- placeholder_doc = Document(
595
- page_content="PDF ๋ฌธ์„œ์—์„œ ๋‚ด์šฉ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.",
596
- metadata={"source": pdf_path, "page": 0}
597
- )
598
- processed_docs = [placeholder_doc]
599
-
600
- logger.info(f"โœ… PDF ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {len(processed_docs)}๊ฐœ ์ฒญํฌ")
601
- return processed_docs
602
-
603
- except Exception as e:
604
- logger.error(f"โŒ PDF ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
605
- # ์‹คํŒจ ์‹œ ๊ธฐ์กด ํ…์ŠคํŠธ ๊ธฐ๋ฐ˜ ์ฒ˜๋ฆฌ๋กœ ํด๋ฐฑ
606
- logger.info("๐Ÿ”„ ํ…์ŠคํŠธ ๊ธฐ๋ฐ˜ ์ฒ˜๋ฆฌ๋กœ ํด๋ฐฑํ•ฉ๋‹ˆ๋‹ค.")
607
- return self.load_document(pdf_path)
608
-
609
  def _is_valid_image(self, img: Image.Image) -> bool:
610
  """์ด๋ฏธ์ง€ ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ"""
611
  try:
 
 
612
  # ์ตœ์†Œ/์ตœ๋Œ€ ํฌ๊ธฐ ํ™•์ธ
613
  if img.size[0] < self.min_image_size[0] or img.size[1] < self.min_image_size[1]:
 
614
  return False
615
  if img.size[0] > self.max_image_size[0] or img.size[1] > self.max_image_size[1]:
 
616
  return False
617
 
618
  # ์ด๋ฏธ์ง€ ๋ชจ๋“œ ํ™•์ธ
619
  if img.mode not in ['RGB', 'RGBA', 'L']:
 
620
  return False
621
 
 
622
  return True
623
- except Exception:
 
624
  return False
625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
626
  def _extract_formulas_from_documents(self, documents: List[Document]) -> List[Document]:
627
- """
628
- ๋ฌธ์„œ์—์„œ ์ˆ˜์‹์„ ์ถ”์ถœํ•˜์—ฌ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์— ์ถ”๊ฐ€
629
- """
630
- # ๊ธฐ์กด ๊ตฌํ˜„ ์œ ์ง€
631
  return documents
632
 
633
  def get_document_info(self, file_path: str) -> Dict[str, Any]:
634
- """๋ฌธ์„œ ์ •๋ณด ๋ฐ˜ํ™˜"""
635
  try:
636
- documents = self.load_document(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
637
 
 
638
  total_text = ""
639
  for doc in documents:
640
- total_text += doc.page_content + "\n"
 
641
 
642
  return {
643
  'file_path': file_path,
@@ -659,4 +882,4 @@ class DocumentProcessor:
659
  document_processor = DocumentProcessor(formula_ocr_engine='latexocr')
660
  # ํ•„์š”์‹œ ๋‹ค๋ฅธ ์—”์ง„์œผ๋กœ ๋ณ€๊ฒฝ ๊ฐ€๋Šฅ:
661
  # document_processor = DocumentProcessor(formula_ocr_engine='easyocr') # EasyOCR ์‚ฌ์šฉ
662
- # document_processor = DocumentProcessor(formula_ocr_engine='mathpix') # MathPix API ์‚ฌ์šฉ
 
2
  """
3
  ๋ฌธ์„œ ์ฒ˜๋ฆฌ ๋ชจ๋“ˆ
4
  PDF, DOCX, PPTX ๋“ฑ ๋‹ค์–‘ํ•œ ๋ฌธ์„œ ํ˜•์‹์„ ์ฒ˜๋ฆฌ
5
+ ์‹ค๋ฌด ์ˆ˜์ค€ PDF ๊ตฌ์กฐ ๋ถ„์„ + ๊ณต๊ฐ„์  ๊ด€๊ณ„ ๋งคํ•‘ ์ง€์›
6
  """
7
 
8
  import os
9
  import logging
10
+ from typing import List, Dict, Any, Optional, Tuple, NamedTuple
11
  from pathlib import Path
12
  import easyocr
13
  import re
14
  import base64
15
  import io
16
+ import json
17
+ from dataclasses import dataclass, field
18
 
19
  # LangChain ๋ฌธ์„œ ๋กœ๋”๋“ค
20
  try:
21
  from langchain_community.document_loaders import (
22
  PyMuPDFLoader,
23
  UnstructuredWordDocumentLoader,
24
+ UnstructuredPowerPointLoader
 
25
  )
26
+ from langchain.schema import Document
27
+ LANGCHAIN_AVAILABLE = True
 
 
 
 
 
 
 
 
 
28
  except ImportError:
29
+ LANGCHAIN_AVAILABLE = False
30
+ Document = None
31
 
32
+ # PIL (Pillow) ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ
33
  try:
34
  from PIL import Image, ImageEnhance
35
  PIL_AVAILABLE = True
 
40
 
41
  logger = logging.getLogger(__name__)
42
 
43
+ # ์‹ค๋ฌด ์ˆ˜์ค€ PDF ๊ตฌ์กฐ ๋ถ„์„์„ ์œ„ํ•œ ๋ฐ์ดํ„ฐ ํด๋ž˜์Šค๋“ค
44
+ @dataclass
45
+ class BoundingBox:
46
+ """๋ฐ”์šด๋”ฉ ๋ฐ•์Šค (x0, y0, x1, y1)"""
47
+ x0: float
48
+ y0: float
49
+ x1: float
50
+ y1: float
51
+
52
+ @property
53
+ def width(self) -> float:
54
+ return self.x1 - self.x0
55
+
56
+ @property
57
+ def height(self) -> float:
58
+ return self.y1 - self.y0
59
+
60
+ @property
61
+ def center(self) -> Tuple[float, float]:
62
+ return ((self.x0 + self.x1) / 2, (self.y0 + self.y1) / 2)
63
+
64
+ def overlaps_with(self, other: 'BoundingBox') -> bool:
65
+ """๋‹ค๋ฅธ ๋ฐ•์Šค์™€ ๊ฒน์น˜๋Š”์ง€ ํ™•์ธ"""
66
+ return not (self.x1 < other.x0 or other.x1 < self.x0 or
67
+ self.y1 < other.y0 or other.y1 < self.y0)
68
+
69
+ def distance_to(self, other: 'BoundingBox') -> float:
70
+ """๋‹ค๋ฅธ ๋ฐ•์Šค์™€์˜ ๊ฑฐ๋ฆฌ (์ค‘์‹ฌ์  ๊ธฐ์ค€)"""
71
+ cx1, cy1 = self.center
72
+ cx2, cy2 = other.center
73
+ return ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
74
+
75
+ @dataclass
76
+ class PDFBlock:
77
+ """PDF์˜ ํ…์ŠคํŠธ/์ด๋ฏธ์ง€ ๋ธ”๋ก"""
78
+ block_id: str
79
+ block_type: str # 'text', 'image', 'table', 'figure'
80
+ bbox: BoundingBox
81
+ content: Any
82
+ page_num: int
83
+ metadata: Dict[str, Any] = field(default_factory=dict)
84
+
85
+ def is_near(self, other: 'PDFBlock', threshold: float = 50.0) -> bool:
86
+ """๋‹ค๋ฅธ ๋ธ”๋ก๊ณผ ๊ฐ€๊นŒ์šด์ง€ ํ™•์ธ (ํ”ฝ์…€ ๋‹จ์œ„)"""
87
+ return self.bbox.distance_to(other.bbox) <= threshold
88
+
89
+ def is_above(self, other: 'PDFBlock', threshold: float = 10.0) -> bool:
90
+ """๋‹ค๋ฅธ ๋ธ”๋ก ์œ„์— ์žˆ๋Š”์ง€ ํ™•์ธ"""
91
+ return self.bbox.y1 <= other.bbox.y0 + threshold
92
+
93
+ def is_below(self, other: 'PDFBlock', threshold: float = 10.0) -> bool:
94
+ """๋‹ค๋ฅธ ๋ธ”๋ก ์•„๋ž˜์— ์žˆ๋Š”์ง€ ํ™•์ธ"""
95
+ return self.bbox.y0 >= other.bbox.y1 - threshold
96
+
97
+ def is_left_of(self, other: 'PDFBlock', threshold: float = 10.0) -> bool:
98
+ """๋‹ค๋ฅธ ๋ธ”๋ก ์™ผ์ชฝ์— ์žˆ๋Š”์ง€ ํ™•์ธ"""
99
+ return self.bbox.x1 <= other.bbox.x0 + threshold
100
+
101
+ def is_right_of(self, other: 'PDFBlock', threshold: float = 10.0) -> bool:
102
+ """๋‹ค๋ฅธ ๋ธ”๋ก ์˜ค๋ฅธ์ชฝ์— ์žˆ๋Š”์ง€ ํ™•์ธ"""
103
+ return self.bbox.x0 >= other.bbox.x1 - threshold
104
+
105
+ @dataclass
106
+ class PDFPage:
107
+ """PDF ํŽ˜์ด์ง€ ๊ตฌ์กฐ"""
108
+ page_num: int
109
+ width: float
110
+ height: float
111
+ blocks: List[PDFBlock] = field(default_factory=list)
112
+
113
+ def get_blocks_by_type(self, block_type: str) -> List[PDFBlock]:
114
+ """ํŠน์ • ํƒ€์ž…์˜ ๋ธ”๋ก๋“ค ๋ฐ˜ํ™˜"""
115
+ return [block for block in self.blocks if block.block_type == block_type]
116
+
117
+ def find_related_blocks(self, target_block: PDFBlock,
118
+ relation_types: List[str] = None) -> List[Tuple[PDFBlock, str]]:
119
+ """๊ด€๋ จ๋œ ๋ธ”๋ก๋“ค๊ณผ ๊ด€๊ณ„ ํƒ€์ž… ๋ฐ˜ํ™˜"""
120
+ if relation_types is None:
121
+ relation_types = ['near', 'above', 'below', 'left', 'right']
122
+
123
+ related = []
124
+ for block in self.blocks:
125
+ if block.block_id == target_block.block_id:
126
+ continue
127
+
128
+ for relation in relation_types:
129
+ if relation == 'near' and target_block.is_near(block):
130
+ related.append((block, 'near'))
131
+ elif relation == 'above' and target_block.is_above(block):
132
+ related.append((block, 'above'))
133
+ elif relation == 'below' and target_block.is_below(block):
134
+ related.append((block, 'below'))
135
+ elif relation == 'left' and target_block.is_left_of(block):
136
+ related.append((block, 'left'))
137
+ elif relation == 'right' and target_block.is_right_of(block):
138
+ related.append((block, 'right'))
139
+
140
+ return related
141
+
142
+ @dataclass
143
+ class PDFStructure:
144
+ """PDF ์ „์ฒด ๊ตฌ์กฐ"""
145
+ pages: List[PDFPage] = field(default_factory=list)
146
+ metadata: Dict[str, Any] = field(default_factory=dict)
147
+
148
+ def get_all_blocks(self) -> List[PDFBlock]:
149
+ """๋ชจ๋“  ๋ธ”๋ก ๋ฐ˜ํ™˜"""
150
+ all_blocks = []
151
+ for page in self.pages:
152
+ all_blocks.extend(page.blocks)
153
+ return all_blocks
154
+
155
+ def get_blocks_by_type(self, block_type: str) -> List[PDFBlock]:
156
+ """์ „์ฒด ๋ฌธ์„œ์—์„œ ํŠน์ • ํƒ€์ž…์˜ ๋ธ”๋ก๋“ค ๋ฐ˜ํ™˜"""
157
+ blocks = []
158
+ for page in self.pages:
159
+ blocks.extend(page.get_blocks_by_type(block_type))
160
+ return blocks
161
+
162
  class DocumentProcessor:
163
  """๋ฌธ์„œ ์ฒ˜๋ฆฌ ํด๋ž˜์Šค"""
164
 
165
  def __init__(self, formula_ocr_engine: str = 'easyocr'):
166
  """
167
+ ๋ฌธ์„œ ์ฒ˜๋ฆฌ๊ธฐ ์ดˆ๊ธฐํ™”
168
+
169
  Args:
170
  formula_ocr_engine: ์ˆ˜์‹ ์ถ”์ถœ ์—”์ง„ ('easyocr', 'mathpix', 'latexocr')
171
  """
172
  self.formula_ocr_engine = formula_ocr_engine
173
+ self.formula_extractor_available = False
 
 
 
 
 
 
 
174
 
175
+ # ์ด๋ฏธ์ง€ ํฌ๊ธฐ ์ œํ•œ ์„ค์ •
176
+ self.min_image_size = (10, 10) # ์ตœ์†Œ ์ด๋ฏธ์ง€ ํฌ๊ธฐ (๋„ˆ๋ฌด ์ž‘์€ ์ด๋ฏธ์ง€ ์ œ์™ธ)
177
+ self.max_image_size = (10000, 10000) # ์ตœ๋Œ€ ์ด๋ฏธ์ง€ ํฌ๊ธฐ (๋„ˆ๋ฌด ํฐ ์ด๋ฏธ์ง€ ์ œ์™ธ)
 
 
 
 
178
 
179
+ # OCR ์—”์ง„ ์ดˆ๊ธฐํ™”
180
+ try:
181
+ self.ocr_reader = easyocr.Reader(['ko', 'en'], gpu=False)
182
+ logger.info("โœ… EasyOCR ์ดˆ๊ธฐํ™” ์™„๋ฃŒ")
183
+ except Exception as e:
184
+ logger.warning(f"โš ๏ธ EasyOCR ์ดˆ๊ธฐํ™” ์‹คํŒจ: {e}")
185
  self.ocr_reader = None
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  # ์ˆ˜์‹ ์ถ”์ถœ ์—”์ง„ ์„ค์ • (LaTeX-OCR ๋น„ํ™œ์„ฑํ™”๋จ)
188
  if formula_ocr_engine in ['mathpix']: # 'latexocr' ์ œ๊ฑฐ
 
195
  logger.warning(f"โš ๏ธ ์ˆ˜์‹ ์ถ”์ถœ ์—”์ง„ {formula_ocr_engine} ์‚ฌ์šฉ ๋ถˆ๊ฐ€, EasyOCR๋กœ ๋Œ€์ฒด")
196
  else:
197
  self.formula_extractor_available = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
+ logger.info(f"๐Ÿš€ DocumentProcessor ์ดˆ๊ธฐํ™” ์™„๋ฃŒ (OCR: {'EasyOCR' if self.ocr_reader else 'None'}, ์ˆ˜์‹: {formula_ocr_engine})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
+ def get_file_type(self, file_path: str) -> str:
202
+ """ํŒŒ์ผ ํ™•์žฅ์ž ๊ธฐ๋ฐ˜ ํŒŒ์ผ ํƒ€์ž… ๋ฐ˜ํ™˜"""
203
+ ext = Path(file_path).suffix.lower()
204
+ # ํ™•์žฅ์ž์—์„œ ์  ์ œ๊ฑฐ
205
+ return ext[1:] if ext.startswith('.') else ext
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  def load_document(self, file_path: str) -> List[Document]:
208
+ """๋ฌธ์„œ ๋กœ๋“œ (๊ธฐ๋ณธ ๋ฐฉ์‹)"""
 
 
 
 
 
 
209
  try:
210
+ file_type = self.get_file_type(file_path)
211
+
212
  if file_type == 'pdf':
213
+ loader = PyMuPDFLoader(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  elif file_type == 'docx':
215
  loader = UnstructuredWordDocumentLoader(file_path)
 
 
 
216
  elif file_type == 'pptx':
217
  loader = UnstructuredPowerPointLoader(file_path)
 
 
 
 
 
 
 
 
218
  else:
219
+ logger.warning(f"โš ๏ธ ์ง€์›ํ•˜์ง€ ์•Š๋Š” ํŒŒ์ผ ํ˜•์‹: {file_type}")
220
+ return []
221
+
222
+ documents = loader.load()
223
+ logger.info(f"๐Ÿ“„ ๋ฌธ์„œ ๋กœ๋“œ ์™„๋ฃŒ: {len(documents)}๊ฐœ ์ฒญํฌ")
224
+ return documents
225
+
226
  except Exception as e:
227
  logger.error(f"โŒ ๋ฌธ์„œ ๋กœ๋“œ ์‹คํŒจ: {e}")
228
+ return []
 
 
229
 
230
+ def split_documents(self, documents: List[Document], chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
231
  """๋ฌธ์„œ๋ฅผ ์ฒญํฌ๋กœ ๋ถ„ํ• """
 
 
232
  try:
233
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
234
+
235
+ text_splitter = RecursiveCharacterTextSplitter(
236
+ chunk_size=chunk_size,
237
+ chunk_overlap=chunk_overlap,
238
+ length_function=len,
239
+ separators=["\n\n", "\n", " ", ""]
240
+ )
241
+
242
+ split_docs = text_splitter.split_documents(documents)
243
+ logger.info(f"๐Ÿ“„ ๋ฌธ์„œ ๋ถ„ํ•  ์™„๋ฃŒ: {len(documents)}๊ฐœ โ†’ {len(split_docs)}๊ฐœ ์ฒญํฌ")
244
  return split_docs
245
+
246
  except Exception as e:
247
  logger.error(f"โŒ ๋ฌธ์„œ ๋ถ„ํ•  ์‹คํŒจ: {e}")
248
+ return documents
249
 
250
  def process_document(self, file_path: str) -> List[Document]:
251
  """
 
263
  file_type = self.get_file_type(file_path)
264
 
265
  if file_type == 'pdf':
266
+ # PDF๋Š” ์‹ค๋ฌด ์ˆ˜์ค€ ๊ตฌ์กฐ ๋ถ„์„์œผ๋กœ ์ฒ˜๋ฆฌ
267
+ pdf_structure = self._process_pdf_with_structure_analysis(file_path)
268
+ # PDFStructure๋ฅผ Document ๊ฐ์ฒด๋กœ ๋ณ€ํ™˜
269
+ documents = self._convert_structure_to_documents(pdf_structure)
270
+ logger.info(f"โœ… ์‹ค๋ฌด ์ˆ˜๏ฟฝ๏ฟฝ๏ฟฝ PDF ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {len(documents)}๊ฐœ Document ์ƒ์„ฑ")
271
+ return documents
272
  else:
273
  # ๋‹ค๋ฅธ ๋ฌธ์„œ ํ˜•์‹์€ ํ…์ŠคํŠธ ๊ธฐ๋ฐ˜ ์ฒ˜๋ฆฌ
274
  documents = self.load_document(file_path)
 
298
  except Exception as e:
299
  logger.error(f"โŒ ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
300
  return []
301
+
302
  def _process_pdf_hybrid(self, pdf_path: str) -> List[Document]:
303
  """
304
+ ์‹ค๋ฌด ์ˆ˜์ค€ PDF ์ฒ˜๋ฆฌ (๊ตฌ์กฐ ๋ถ„์„ + ๊ณต๊ฐ„์  ๊ด€๊ณ„ ๋งคํ•‘)
305
+ """
306
+ return self._process_pdf_with_structure_analysis(pdf_path)
307
+
308
+ def _process_pdf_with_structure_analysis(self, pdf_path: str) -> List[Document]:
309
+ """
310
+ ์‹ค๋ฌด ์ˆ˜์ค€ PDF ์ฒ˜๋ฆฌ (๊ตฌ์กฐ ๋ถ„์„ + ๊ณต๊ฐ„์  ๊ด€๊ณ„ ๋งคํ•‘)
 
311
  """
312
  try:
313
  import fitz # PyMuPDF
314
 
315
  doc = fitz.open(pdf_path)
316
+ pdf_structure = PDFStructure()
317
+
318
+ # PDF ๊ฒฝ๋กœ ์ •๋ณด๋ฅผ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์— ์ถ”๊ฐ€
319
+ pdf_structure.metadata["source"] = pdf_path
320
 
321
+ logger.info(f"๐Ÿ” PDF ๊ตฌ์กฐ ๋ถ„์„ ์‹œ์ž‘: {len(doc)}ํŽ˜์ด์ง€")
322
 
323
  for page_num in range(len(doc)):
324
+ page = doc[page_num]
325
+ page_rect = page.rect
326
+
327
+ pdf_page = PDFPage(
328
+ page_num=page_num + 1,
329
+ width=page_rect.width,
330
+ height=page_rect.height
331
+ )
332
+
333
+ # 1. ํ…์ŠคํŠธ ๋ธ”๋ก ์ถ”์ถœ (์œ„์น˜ ์ •๋ณด ํฌํ•จ)
334
+ text_blocks = self._extract_text_blocks(page, page_num)
335
+ pdf_page.blocks.extend(text_blocks)
336
+
337
+ # 2. ์ด๋ฏธ์ง€ ๋ธ”๋ก ์ถ”์ถœ (์œ„์น˜ ์ •๋ณด ํฌํ•จ)
338
+ image_blocks = self._extract_image_blocks(page, page_num, doc)
339
+ pdf_page.blocks.extend(image_blocks)
340
+
341
+ pdf_structure.pages.append(pdf_page)
342
+
343
+ logger.info(f"๐Ÿ“„ ํŽ˜์ด์ง€ {page_num + 1} ๋ถ„์„ ์™„๋ฃŒ: "
344
+ f"ํ…์ŠคํŠธ ๋ธ”๋ก {len(text_blocks)}๊ฐœ, "
345
+ f"์ด๋ฏธ์ง€ ๋ธ”๋ก {len(image_blocks)}๊ฐœ")
346
+
347
+ doc.close()
348
+
349
+ # 3. ๋ธ”๋ก ๊ฐ„ ๊ด€๊ณ„ ๋ถ„์„
350
+ self._analyze_block_relationships(pdf_structure)
351
+
352
+ logger.info(f"โœ… PDF ๊ตฌ์กฐ ๋ถ„์„ ์™„๋ฃŒ: ์ด {len(pdf_structure.get_all_blocks())}๊ฐœ ๋ธ”๋ก")
353
+ return pdf_structure
354
+
355
+ except Exception as e:
356
+ logger.error(f"โŒ PDF ๊ตฌ์กฐ ๋ถ„์„ ์‹คํŒจ: {e}")
357
+ import traceback
358
+ traceback.print_exc()
359
+ return PDFStructure()
360
+
361
+ def _extract_text_blocks(self, page, page_num: int) -> List[PDFBlock]:
362
+ """ํŽ˜์ด์ง€์—์„œ ํ…์ŠคํŠธ ๋ธ”๋ก ์ถ”์ถœ"""
363
+ text_blocks = []
364
+
365
+ try:
366
+ # PyMuPDF์˜ get_text("dict") ์‚ฌ์šฉ - ๊ฐ€์žฅ ์ •ํ™•ํ•œ ์œ„์น˜ ์ •๋ณด ์ œ๊ณต
367
+ text_dict = page.get_text("dict")
368
+
369
+ for block_idx, block in enumerate(text_dict["blocks"]):
370
+ if "lines" not in block: # ์ด๋ฏธ์ง€ ๋ธ”๋ก์€ ๊ฑด๋„ˆ๋›ฐ๊ธฐ
371
+ continue
372
 
373
+ # ๋ธ”๋ก์˜ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค
374
+ bbox = BoundingBox(
375
+ x0=block["bbox"][0],
376
+ y0=block["bbox"][1],
377
+ x1=block["bbox"][2],
378
+ y1=block["bbox"][3]
379
+ )
380
+
381
+ # ํ…์ŠคํŠธ ๋‚ด์šฉ ์ถ”์ถœ
382
+ text_content = ""
383
+ for line in block["lines"]:
384
+ for span in line["spans"]:
385
+ text_content += span["text"]
386
+ text_content += "\n"
387
 
388
+ text_content = text_content.strip()
 
 
389
 
390
+ if text_content: # ๋นˆ ํ…์ŠคํŠธ ๋ธ”๋ก์€ ์ œ์™ธ
391
+ text_block = PDFBlock(
392
+ block_id=f"page_{page_num + 1}_text_{block_idx}",
393
+ block_type="text",
394
+ bbox=bbox,
395
+ content=text_content,
396
+ page_num=page_num + 1,
397
+ metadata={
398
+ "font_info": self._extract_font_info(block),
399
+ "word_count": len(text_content.split()),
400
+ "char_count": len(text_content)
401
+ }
402
+ )
403
+ text_blocks.append(text_block)
404
+
405
+ except Exception as e:
406
+ logger.warning(f"โš ๏ธ ํŽ˜์ด์ง€ {page_num + 1} ํ…์ŠคํŠธ ๋ธ”๋ก ์ถ”์ถœ ์‹คํŒจ: {e}")
407
+
408
+ return text_blocks
409
+
410
def _extract_image_blocks(self, page, page_num: int, doc=None) -> List[PDFBlock]:
    """Extract the image blocks of a single page.

    Embedded images are decoded through ``doc`` and re-encoded as PNG. An
    image becomes a ``PDFBlock`` only if it passes ``_is_valid_image`` and
    ``_get_image_rect`` finds its rectangle on the page. If no embedded
    image produced a block and the active model profile is multimodal, the
    whole page is rendered at 2x zoom and returned as one block.

    Args:
        page: PyMuPDF page object.
        page_num: Zero-based page index; ids and the stored page number
            use ``page_num + 1``.
        doc: PyMuPDF document owning ``page``. Without it embedded images
            cannot be decoded and are skipped.

    Returns:
        List of image ``PDFBlock`` objects (possibly empty). Errors are
        logged as warnings, never raised.
    """
    image_blocks = []

    try:
        import fitz  # PyMuPDF

        # 1. Embedded images (need the document to resolve the xref).
        embedded = page.get_images() if doc else []

        for img_idx, img_info in enumerate(embedded):
            try:
                xref = img_info[0]
                pix = fitz.Pixmap(doc, xref)

                # PNG can only hold gray/RGB data: convert CMYK-like
                # pixmaps (4+ colour components) to RGB first.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                img_data = pix.tobytes("png")
                pix = None  # drop the pixmap reference as early as possible

                img_pil = Image.open(io.BytesIO(img_data))
                if not self._is_valid_image(img_pil):
                    continue

                # The on-page position is required for spatial analysis.
                img_rect = self._get_image_rect(page, xref)
                if not img_rect:
                    continue

                bbox = BoundingBox(
                    x0=img_rect.x0,
                    y0=img_rect.y0,
                    x1=img_rect.x1,
                    y1=img_rect.y1
                )

                image_blocks.append(
                    PDFBlock(
                        block_id=f"page_{page_num + 1}_image_{img_idx}",
                        block_type="image",
                        bbox=bbox,
                        content=img_data,  # raw PNG bytes
                        page_num=page_num + 1,
                        metadata={
                            "image_size": img_pil.size,
                            "image_format": "PNG",
                            "image_mode": img_pil.mode,
                            "xref": xref,
                            "is_embedded": True
                        }
                    )
                )

                logger.debug(f"๐Ÿ–ผ๏ธ ์ด๋ฏธ์ง€ ๋ธ”๋ก ์ถ”์ถœ: ํŽ˜์ด์ง€ {page_num + 1}, "
                             f"์œ„์น˜ ({bbox.x0:.1f}, {bbox.y0:.1f}, {bbox.x1:.1f}, {bbox.y1:.1f}), "
                             f"ํฌ๊ธฐ {img_pil.size}")

            except Exception as e:
                logger.warning(f"โš ๏ธ ์ด๋ฏธ์ง€ {img_idx} ์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")

        # 2. Fallback: render the whole page when nothing was extracted.
        if not image_blocks:
            try:
                # Imported lazily; if the import fails the fallback is
                # simply disabled.
                from lily_llm_api.app_v2 import current_profile
            except ImportError:
                current_profile = None

            if getattr(current_profile, 'multimodal', False):
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
                img_data = pix.tobytes("png")

                bbox = BoundingBox(
                    x0=0, y0=0,
                    x1=page.rect.width,
                    y1=page.rect.height
                )

                image_blocks.append(
                    PDFBlock(
                        block_id=f"page_{page_num + 1}_fullpage",
                        block_type="image",
                        bbox=bbox,
                        content=img_data,
                        page_num=page_num + 1,
                        metadata={
                            "image_size": (pix.width, pix.height),
                            "image_format": "PNG",
                            "is_embedded": False,
                            "is_full_page_render": True
                        }
                    )
                )

                logger.debug(f"๐Ÿ“„ ์ „์ฒด ํŽ˜์ด์ง€ ๋ Œ๋”๋ง: ํŽ˜์ด์ง€ {page_num + 1}")
                pix = None

    except Exception as e:
        logger.warning(f"โš ๏ธ ํŽ˜์ด์ง€ {page_num + 1} ์ด๋ฏธ์ง€ ๋ธ”๋ก ์ถ”์ถœ ์‹คํŒจ: {e}")

    return image_blocks
518
+
519
def _get_image_rect(self, page, xref: int) -> Optional[Any]:
    """Return the on-page rectangle of the embedded image ``xref``.

    ``page.get_image_info`` only reports an ``"xref"`` entry when called
    with ``xrefs=True``, so the flag is passed explicitly; the lookup also
    uses ``.get`` so an entry without that key is skipped instead of
    aborting the search.

    Args:
        page: PyMuPDF page object.
        xref: Cross-reference number of the image to locate.

    Returns:
        ``fitz.Rect`` of the first placement of the image on the page, or
        ``None`` if it is not displayed there or the lookup fails.
    """
    try:
        import fitz  # PyMuPDF

        for placement in page.get_image_info(xrefs=True):
            if placement.get("xref") == xref:
                return fitz.Rect(placement["bbox"])

        return None

    except Exception as e:
        logger.debug(f"โš ๏ธ ์ด๋ฏธ์ง€ ์œ„์น˜ ์ฐพ๊ธฐ ์‹คํŒจ: {e}")
        return None
545
+
546
+ def _extract_font_info(self, block: Dict) -> Dict[str, Any]:
547
+ """ํ…์ŠคํŠธ ๋ธ”๋ก์—์„œ ํฐํŠธ ์ •๋ณด ์ถ”์ถœ"""
548
+ font_info = {
549
+ "fonts": [],
550
+ "sizes": [],
551
+ "flags": []
552
+ }
553
+
554
+ try:
555
+ for line in block.get("lines", []):
556
+ for span in line.get("spans", []):
557
+ font_info["fonts"].append(span.get("font", ""))
558
+ font_info["sizes"].append(span.get("size", 0))
559
+ font_info["flags"].append(span.get("flags", 0))
560
+
561
+ # ์ค‘๋ณต ์ œ๊ฑฐ
562
+ font_info["fonts"] = list(set(font_info["fonts"]))
563
+ font_info["sizes"] = list(set(font_info["sizes"]))
564
+ font_info["flags"] = list(set(font_info["flags"]))
565
+
566
+ except Exception as e:
567
+ logger.debug(f"โš ๏ธ ํฐํŠธ ์ •๋ณด ์ถ”์ถœ ์‹คํŒจ: {e}")
568
+
569
+ return font_info
570
+
571
def _analyze_block_relationships(self, pdf_structure: PDFStructure):
    """Attach nearby-text information to every image block.

    For each page, every image block is compared with every text block of
    the same page. Text blocks for which ``image_block.is_near(...,
    threshold=100)`` holds are recorded with their spatial relationship,
    distance and a preview of at most 100 characters. The three closest
    entries are stored under ``metadata["related_texts"]`` of the image.

    Args:
        pdf_structure: Structure whose ``pages`` are updated in place.
    """
    for page in pdf_structure.pages:
        texts = page.get_blocks_by_type("text")
        images = page.get_blocks_by_type("image")

        for image_block in images:
            related_texts = [
                {
                    "block_id": candidate.block_id,
                    "relationship": self._determine_spatial_relationship(image_block, candidate),
                    "distance": image_block.bbox.distance_to(candidate.bbox),
                    "content_preview": (
                        candidate.content[:100] + "..."
                        if len(candidate.content) > 100
                        else candidate.content
                    ),
                }
                for candidate in texts
                if image_block.is_near(candidate, threshold=100)
            ]

            # Closest first; only the three nearest are kept on the block.
            related_texts.sort(key=lambda entry: entry["distance"])
            image_block.metadata["related_texts"] = related_texts[:3]

            logger.debug(f"๐Ÿ”— ์ด๋ฏธ์ง€ ๋ธ”๋ก {image_block.block_id}์— "
                         f"{len(related_texts)}๊ฐœ ํ…์ŠคํŠธ ๋ธ”๋ก ์—ฐ๊ฒฐ")
599
+
600
+ def _determine_spatial_relationship(self, block1: PDFBlock, block2: PDFBlock) -> str:
601
+ """๋‘ ๋ธ”๋ก ๊ฐ„์˜ ๊ณต๊ฐ„์  ๊ด€๊ณ„ ๊ฒฐ์ •"""
602
+ if block1.is_above(block2):
603
+ return "above"
604
+ elif block1.is_below(block2):
605
+ return "below"
606
+ elif block1.is_left_of(block2):
607
+ return "left"
608
+ elif block1.is_right_of(block2):
609
+ return "right"
610
+ else:
611
+ return "near"
612
+
613
def _convert_structure_to_documents(self, pdf_structure: PDFStructure) -> List[Document]:
    """Turn an analysed PDF structure into one ``Document`` per page.

    Pages with neither text nor image blocks are skipped. The page text is
    produced by ``_build_structured_content``; the metadata carries page
    statistics, the raw image bytes with their positions, and a compact
    description of every text and image block.

    Args:
        pdf_structure: Structure providing ``pages`` and ``metadata``.

    Returns:
        Documents created before any failure; an exception is logged,
        its traceback printed, and the partial list returned.
    """

    def box_of(blk):
        # Plain-dict form of a block's bounding box.
        b = blk.bbox
        return {"x0": b.x0, "y0": b.y0, "x1": b.x1, "y1": b.y1}

    def short(text):
        # Preview limited to 50 characters.
        return text[:50] + "..." if len(text) > 50 else text

    documents = []

    try:
        for page in pdf_structure.pages:
            text_blocks = page.get_blocks_by_type("text")
            image_blocks = page.get_blocks_by_type("image")

            if not (text_blocks or image_blocks):
                continue  # nothing on this page

            page_content = self._build_structured_content(page, text_blocks, image_blocks)

            # Image payloads and their descriptions, index-aligned.
            image_data_list = [img.content for img in image_blocks]
            image_metadata = [
                {
                    "block_id": img.block_id,
                    "bbox": box_of(img),
                    "size": img.metadata.get("image_size"),
                    "related_texts": img.metadata.get("related_texts", []),
                }
                for img in image_blocks
            ]

            text_summaries = [
                {
                    "block_id": tb.block_id,
                    "bbox": box_of(tb),
                    "word_count": tb.metadata.get("word_count", 0),
                    "content_preview": short(tb.content),
                }
                for tb in text_blocks
            ]
            image_summaries = [
                {
                    "block_id": ib.block_id,
                    "bbox": box_of(ib),
                    "size": ib.metadata.get("image_size"),
                    "related_text_count": len(ib.metadata.get("related_texts", [])),
                }
                for ib in image_blocks
            ]

            metadata = {
                "source": pdf_structure.metadata.get("source", "unknown"),
                "page": page.page_num,
                "total_pages": len(pdf_structure.pages),
                "has_text": len(text_blocks) > 0,
                "has_images": len(image_blocks) > 0,
                "text_block_count": len(text_blocks),
                "image_block_count": len(image_blocks),
                "page_width": page.width,
                "page_height": page.height,

                # Flags describing how the page was analysed
                "structured_analysis": True,
                "spatial_relationships": True,

                # Data for multimodal processing
                "multimodal_ready": len(image_blocks) > 0,
                "image_data_list": image_data_list,
                "image_metadata": image_metadata,

                # Block structure
                "text_blocks": text_summaries,
                "image_blocks": image_summaries,
            }

            documents.append(Document(page_content=page_content, metadata=metadata))

            logger.info(f"๐Ÿ“„ ํŽ˜์ด์ง€ {page.page_num} Document ์ƒ์„ฑ: "
                        f"ํ…์ŠคํŠธ ๋ธ”๋ก {len(text_blocks)}๊ฐœ, ์ด๋ฏธ์ง€ ๋ธ”๋ก {len(image_blocks)}๊ฐœ")

    except Exception as e:
        logger.error(f"โŒ PDF ๊ตฌ์กฐ โ†’ Document ๋ณ€ํ™˜ ์‹คํŒจ: {e}")
        import traceback
        traceback.print_exc()

    return documents
704
+
705
+ def _build_structured_content(self, page: PDFPage, text_blocks: List[PDFBlock],
706
+ image_blocks: List[PDFBlock]) -> str:
707
+ """
708
+ ํŽ˜์ด์ง€์˜ ๊ตฌ์กฐํ™”๋œ ์ปจํ…์ธ  ์ƒ์„ฑ (๊ณต๊ฐ„์  ๊ด€๊ณ„ ๊ธฐ๋ฐ˜)
709
+ """
710
+ content_parts = []
711
+
712
+ # ํŽ˜์ด์ง€ ํ—ค๋”
713
+ content_parts.append(f"=== ํŽ˜์ด์ง€ {page.page_num} ===")
714
+
715
+ # ํ…์ŠคํŠธ ๋ธ”๋ก๋“ค์„ Y ์ขŒํ‘œ ์ˆœ์œผ๋กœ ์ •๋ ฌ (์œ„์—์„œ ์•„๋ž˜๋กœ)
716
+ sorted_text_blocks = sorted(text_blocks, key=lambda b: b.bbox.y0)
717
+
718
+ # ์ด๋ฏธ์ง€-ํ…์ŠคํŠธ ๊ด€๊ณ„๋ฅผ ๊ณ ๋ คํ•œ ์ปจํ…์ธ  ๊ตฌ์„ฑ
719
+ processed_images = set()
720
+
721
+ for text_block in sorted_text_blocks:
722
+ # 1. ํ…์ŠคํŠธ ๋ธ”๋ก ์ถ”๊ฐ€
723
+ content_parts.append(f"\n[ํ…์ŠคํŠธ ๋ธ”๋ก {text_block.block_id}]")
724
+ content_parts.append(text_block.content)
725
+
726
+ # 2. ์ด ํ…์ŠคํŠธ ๋ธ”๋ก๊ณผ ๊ด€๋ จ๋œ ์ด๋ฏธ์ง€ ์ฐพ๊ธฐ
727
+ related_images = []
728
+ for img_block in image_blocks:
729
+ if img_block.block_id in processed_images:
730
+ continue
731
 
732
+ # ๊ณต๊ฐ„์ ์œผ๋กœ ๊ฐ€๊นŒ์šด ์ด๋ฏธ์ง€ ์ฐพ๊ธฐ
733
+ if text_block.is_near(img_block, threshold=150):
734
+ related_images.append(img_block)
735
+
736
+ # 3. ๊ด€๋ จ๋œ ์ด๋ฏธ์ง€๋“ค์„ ๊ฑฐ๋ฆฌ์ˆœ์œผ๋กœ ์ •๋ ฌํ•˜์—ฌ ์ถ”๊ฐ€
737
+ if related_images:
738
+ related_images.sort(key=lambda img: text_block.bbox.distance_to(img.bbox))
739
+
740
+ for img_block in related_images[:2]: # ์ตœ๋Œ€ 2๊ฐœ ์ด๋ฏธ์ง€๋งŒ
741
+ relationship = self._determine_spatial_relationship(text_block, img_block)
742
+ content_parts.append(f"\n[์ด๋ฏธ์ง€ {img_block.block_id} - {relationship} ๊ด€๊ณ„]")
743
+ content_parts.append(f"์ด๋ฏธ์ง€ ํฌ๊ธฐ: {img_block.metadata.get('image_size', 'unknown')}")
744
 
745
+ # ๊ด€๋ จ ํ…์ŠคํŠธ ์ •๋ณด ์ถ”๊ฐ€
746
+ related_texts = img_block.metadata.get("related_texts", [])
747
+ if related_texts:
748
+ content_parts.append("๊ด€๋ จ ํ…์ŠคํŠธ:")
749
+ for rt in related_texts[:2]: # ์ตœ๋Œ€ 2๊ฐœ๋งŒ
750
+ content_parts.append(f" - {rt['relationship']}: {rt['content_preview']}")
751
 
752
+ processed_images.add(img_block.block_id)
753
+
754
+ # 4. ์ฒ˜๋ฆฌ๋˜์ง€ ์•Š์€ ์ด๋ฏธ์ง€๋“ค ์ถ”๊ฐ€ (๋…๋ฆฝ์ ์ธ ์ด๋ฏธ์ง€๋“ค)
755
+ unprocessed_images = [img for img in image_blocks if img.block_id not in processed_images]
756
+ if unprocessed_images:
757
+ content_parts.append(f"\n[๋…๋ฆฝ์ ์ธ ์ด๋ฏธ์ง€๋“ค]")
758
+ for img_block in unprocessed_images:
759
+ content_parts.append(f"\n[์ด๋ฏธ์ง€ {img_block.block_id}]")
760
+ content_parts.append(f"์ด๋ฏธ์ง€ ํฌ๊ธฐ: {img_block.metadata.get('image_size', 'unknown')}")
761
+ content_parts.append(f"์œ„์น˜: ({img_block.bbox.x0:.1f}, {img_block.bbox.y0:.1f})")
762
+
763
+ # 5. ์š”์•ฝ ์ •๋ณด ์ถ”๊ฐ€
764
+ content_parts.append(f"\n[ํŽ˜์ด์ง€ {page.page_num} ์š”์•ฝ]")
765
+ content_parts.append(f"ํ…์ŠคํŠธ ๋ธ”๋ก: {len(text_blocks)}๊ฐœ")
766
+ content_parts.append(f"์ด๋ฏธ์ง€ ๋ธ”๋ก: {len(image_blocks)}๊ฐœ")
767
+ content_parts.append(f"ํŽ˜์ด์ง€ ํฌ๊ธฐ: {page.width:.1f} x {page.height:.1f}")
768
+
769
+ return "\n".join(content_parts)
770
+
 
 
 
 
 
771
def _is_valid_image(self, img: Image.Image) -> bool:
    """Check that an image is within the size limits and in a supported mode.

    Args:
        img: Image exposing ``size`` (width, height) and ``mode``.

    Returns:
        ``True`` when both dimensions lie between ``self.min_image_size``
        and ``self.max_image_size`` and the mode is RGB, RGBA or L;
        ``False`` otherwise, including when the check itself raises.
    """
    try:
        logger.debug(f"๐Ÿ” ์ด๋ฏธ์ง€ ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ: ํฌ๊ธฐ={img.size}, ๋ชจ๋“œ={img.mode}")

        # Reject images smaller than the minimum in either dimension.
        below_minimum = (
            img.size[0] < self.min_image_size[0]
            or img.size[1] < self.min_image_size[1]
        )
        if below_minimum:
            logger.debug(f"โŒ ์ด๋ฏธ์ง€ ํฌ๊ธฐ๊ฐ€ ๋„ˆ๋ฌด ์ž‘์Œ: {img.size} < {self.min_image_size}")
            return False

        # Reject images larger than the maximum in either dimension.
        above_maximum = (
            img.size[0] > self.max_image_size[0]
            or img.size[1] > self.max_image_size[1]
        )
        if above_maximum:
            logger.debug(f"โŒ ์ด๋ฏธ์ง€ ํฌ๊ธฐ๊ฐ€ ๋„ˆ๋ฌด ํผ: {img.size} > {self.max_image_size}")
            return False

        if img.mode not in ['RGB', 'RGBA', 'L']:
            logger.debug(f"โŒ ์ง€์›ํ•˜์ง€ ์•Š๋Š” ์ด๋ฏธ์ง€ ๋ชจ๋“œ: {img.mode}")
            return False

        logger.debug(f"โœ… ์ด๋ฏธ์ง€ ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ ํ†ต๊ณผ: {img.size}, {img.mode}")
        return True
    except Exception as e:
        logger.debug(f"โŒ ์ด๋ฏธ์ง€ ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ ์‹คํŒจ: {e}")
        return False
794
 
795
def _extract_text_from_image(self, img: Image.Image) -> str:
    """Run OCR on an image: EasyOCR first, Tesseract as fallback.

    EasyOCR's ``readtext`` takes a file path, bytes or a numpy array, so
    the PIL image is converted to an RGB array before the call. If
    EasyOCR is not configured, fails, or finds nothing, ``pytesseract``
    is tried on a grayscale copy with ``lang='kor+eng'``.

    Args:
        img: PIL image to read.

    Returns:
        Recognised text, or ``""`` when nothing was recognised or every
        OCR attempt failed. Never raises.
    """
    try:
        # EasyOCR first. A missing attribute is treated like "not configured".
        ocr_reader = getattr(self, "ocr_reader", None)
        if ocr_reader:
            try:
                import numpy as np  # local: only needed on this path

                result = ocr_reader.readtext(np.array(img.convert("RGB")))
                if result:
                    text = " ".join([item[1] for item in result])
                    logger.info(f"โœ… EasyOCR๋กœ ํ…์ŠคํŠธ ์ถ”์ถœ ์„ฑ๊ณต: {len(text)}์ž")
                    return text
            except Exception as e:
                logger.warning(f"โš ๏ธ EasyOCR ์‹คํŒจ: {e}")

        # Tesseract fallback
        try:
            import pytesseract

            # Tesseract is fed a grayscale image.
            img_gray = img if img.mode == 'L' else img.convert('L')

            text = pytesseract.image_to_string(img_gray, lang='kor+eng')
            if text and text.strip():
                logger.info(f"โœ… Tesseract๋กœ ํ…์ŠคํŠธ ์ถ”์ถœ ์„ฑ๊ณต: {len(text.strip())}์ž")
                return text.strip()

            logger.info("๐Ÿ” Tesseract OCR ๊ฒฐ๊ณผ ์—†์Œ")
            return ""

        except ImportError:
            logger.warning("โš ๏ธ pytesseract๊ฐ€ ์„ค์น˜๋˜์ง€ ์•Š์Œ")
            return ""
        except Exception as e:
            logger.warning(f"โš ๏ธ Tesseract OCR ์‹คํŒจ: {e}")
            return ""

    except Exception as e:
        logger.error(f"โŒ ์ด๋ฏธ์ง€ OCR ์ „์ฒด ์‹คํŒจ: {e}")
        return ""
836
+
837
  def _extract_formulas_from_documents(self, documents: List[Document]) -> List[Document]:
838
+ """๋ฌธ์„œ์—์„œ ์ˆ˜์‹ ์ถ”์ถœ (ํ˜„์žฌ ๋น„ํ™œ์„ฑํ™”๋จ)"""
839
+ # ์ˆ˜์‹ ์ถ”์ถœ ๊ธฐ๋Šฅ์€ ํ˜„์žฌ ๋น„ํ™œ์„ฑํ™”
 
 
840
  return documents
841
 
842
  def get_document_info(self, file_path: str) -> Dict[str, Any]:
843
+ """๋ฌธ์„œ ์ •๋ณด ์กฐํšŒ"""
844
  try:
845
+ if not os.path.exists(file_path):
846
+ return {'supported': False, 'error': 'ํŒŒ์ผ์ด ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.'}
847
+
848
+ file_type = self.get_file_type(file_path)
849
+
850
+ if file_type not in ['pdf', 'docx', 'pptx']:
851
+ return {'supported': False, 'error': f'์ง€์›ํ•˜์ง€ ์•Š๋Š” ํŒŒ์ผ ํ˜•์‹: {file_type}'}
852
+
853
+ # ๋ฌธ์„œ ๋กœ๋“œ ์‹œ๋„
854
+ documents = self.process_document(file_path)
855
+
856
+ if not documents:
857
+ return {'supported': False, 'error': '๋ฌธ์„œ ๋‚ด์šฉ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.'}
858
 
859
+ # ํ†ต๊ณ„ ์ •๋ณด ์ˆ˜์ง‘
860
  total_text = ""
861
  for doc in documents:
862
+ if hasattr(doc, 'page_content'):
863
+ total_text += doc.page_content + " "
864
 
865
  return {
866
  'file_path': file_path,
 
882
  document_processor = DocumentProcessor(formula_ocr_engine='latexocr')
883
  # ํ•„์š”์‹œ ๋‹ค๋ฅธ ์—”์ง„์œผ๋กœ ๋ณ€๊ฒฝ ๊ฐ€๋Šฅ:
884
  # document_processor = DocumentProcessor(formula_ocr_engine='easyocr') # EasyOCR ์‚ฌ์šฉ
885
+ # document_processor = DocumentProcessor(formula_ocr_engine='mathpix') # MathPix API ์‚ฌ์šฉ
lily_llm_core/integrated_memory_manager.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ํ†ตํ•ฉ ๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ์ž (Integrated Memory Manager)
4
+ ๊ณ„์ธต์  ๋ฉ”๋ชจ๋ฆฌ ์‹œ์Šคํ…œ์„ ํ†ตํ•ฉ ๊ด€๋ฆฌํ•˜๋Š” ์ค‘์•™ ๊ด€๋ฆฌ์ž
5
+ """
6
+
7
+ import logging
8
+ import time
9
+ from typing import Dict, Any, List, Optional, Tuple
10
+ from dataclasses import dataclass
11
+
12
+ from .user_memory_manager import user_memory_manager, UserMemory
13
+ from .room_context_manager import room_context_manager, RoomContext
14
+ from .context_manager import AdvancedContextManager
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
@dataclass
class MemoryContext:
    """Bundle of user, room and session memory for one request.

    ``user_memory`` and ``room_context`` are ``None`` when the fallback
    path of ``IntegratedMemoryManager`` could not load them, so callers
    must be prepared for missing values.
    """
    # User-level data
    user_id: str
    user_memory: Optional[UserMemory]

    # Chat-room-level data
    room_id: str
    room_context: Optional[RoomContext]

    # Session identifier
    session_id: str

    # Pre-rendered text derived from the data above
    combined_context: str = ""
    memory_summary: str = ""
35
+
36
class IntegratedMemoryManager:
    """Facade over the user, room and per-session memory layers.

    User memory and room context are delegated to the module-level
    ``user_memory_manager`` and ``room_context_manager``; per-session
    conversation state is kept in ``AdvancedContextManager`` instances
    created on demand and stored in ``session_context_managers``.
    """

    def __init__(self):
        self.user_memory_manager = user_memory_manager
        self.room_context_manager = room_context_manager

        # One context manager per session id, created lazily.
        self.session_context_managers: Dict[str, AdvancedContextManager] = {}

        logger.info("๐Ÿš€ IntegratedMemoryManager ์ดˆ๊ธฐํ™” ์™„๋ฃŒ")

    @staticmethod
    def _doc_field(doc: Any, field_name: str, default: Any) -> Any:
        """Read ``field_name`` from a document stored as a dict or an object."""
        if isinstance(doc, dict):
            return doc.get(field_name, default)
        return getattr(doc, field_name, default)

    def get_memory_context(self, user_id: str, room_id: str, session_id: str) -> MemoryContext:
        """Load user memory and room context and bundle them for a session.

        Also makes sure a session context manager exists for ``session_id``.
        On any failure the error is logged and the result of
        ``_create_default_memory_context`` is returned instead.
        """
        try:
            user_memory = self.user_memory_manager.get_user_memory(user_id)
            room_context = self.room_context_manager.get_room_context(room_id)

            # Create the per-session manager if this is a new session.
            self.get_session_context_manager(session_id)

            combined_context = self._create_combined_context(
                user_memory, room_context, session_id
            )
            memory_summary = self._create_memory_summary(
                user_memory, room_context
            )

            return MemoryContext(
                user_id=user_id,
                user_memory=user_memory,
                room_id=room_id,
                room_context=room_context,
                session_id=session_id,
                combined_context=combined_context,
                memory_summary=memory_summary
            )

        except Exception as e:
            logger.error(f"โŒ ๋ฉ”๋ชจ๋ฆฌ ์ปจํ…์ŠคํŠธ ์กฐํšŒ ์‹คํŒจ: {user_id}/{room_id}/{session_id} - {e}")
            return self._create_default_memory_context(user_id, room_id, session_id)

    def add_document_to_room(self, room_id: str, document_info: Dict[str, Any]) -> bool:
        """Register a document with a room and credit the uploader.

        Args:
            room_id: Target room.
            document_info: Document description; ``"uploaded_by"`` and
                ``"filename"`` are read if present.

        Returns:
            The result of ``room_context_manager.add_document``; ``False``
            if an exception occurred.
        """
        try:
            success = self.room_context_manager.add_document(room_id, document_info)

            if success:
                # A missing filename must not turn a successful add into a
                # reported failure, hence .get instead of indexing.
                filename = document_info.get("filename", "unknown")

                user_id = document_info.get("uploaded_by", "unknown")
                if user_id != "unknown":
                    self.user_memory_manager.record_conversation(
                        user_id,
                        topic=f"๋ฌธ์„œ ์—…๋กœ๋“œ: {filename}"
                    )

                logger.info(f"โœ… ๋ฌธ์„œ ์ถ”๊ฐ€ ์™„๋ฃŒ: {room_id} - {filename}")

            return success

        except Exception as e:
            logger.error(f"โŒ ๋ฌธ์„œ ์ถ”๊ฐ€ ์‹คํŒจ: {room_id} - {e}")
            return False

    def update_user_preferences(self, user_id: str, preferences: Dict[str, Any]) -> bool:
        """Forward a preference update to the user memory manager.

        Returns:
            The delegate's result, or ``False`` if it raised.
        """
        try:
            success = self.user_memory_manager.update_preferences(user_id, preferences)

            if success:
                logger.info(f"โœ… ์‚ฌ์šฉ์ž ์„ ํ˜ธ๋„ ์—…๋ฐ์ดํŠธ: {user_id} - {len(preferences)}๊ฐœ ํ•ญ๋ชฉ")

            return success

        except Exception as e:
            logger.error(f"โŒ ์‚ฌ์šฉ์ž ์„ ํ˜ธ๋„ ์—…๋ฐ์ดํŠธ ์‹คํŒจ: {user_id} - {e}")
            return False

    def add_important_info(self, user_id: str, info: str) -> bool:
        """Store an important fact about the user.

        Returns:
            The delegate's result, or ``False`` if it raised.
        """
        try:
            success = self.user_memory_manager.add_important_info(user_id, info)

            if success:
                logger.info(f"โœ… ์ค‘์š” ์ •๋ณด ์ถ”๊ฐ€: {user_id} - {info[:50]}...")

            return success

        except Exception as e:
            logger.error(f"โŒ ์ค‘์š” ์ •๋ณด ์ถ”๊ฐ€ ์‹คํŒจ: {user_id} - {e}")
            return False

    def record_conversation(self, user_id: str, room_id: str, topic: str = None) -> bool:
        """Update user and room statistics for one conversation turn.

        Also adds the user to the room's participants.

        Returns:
            ``user_success and room_success`` as reported by the two
            delegates, or ``False`` if anything raised.
        """
        try:
            user_success = self.user_memory_manager.record_conversation(user_id, topic)
            room_success = self.room_context_manager.increment_message_count(room_id)
            self.room_context_manager.add_participant(room_id, user_id)

            return user_success and room_success

        except Exception as e:
            logger.error(f"โŒ ๋Œ€ํ™” ๊ธฐ๋ก ์‹คํŒจ: {user_id}/{room_id} - {e}")
            return False

    def get_context_for_ai(self, user_id: str, room_id: str, session_id: str,
                           include_user_memory: bool = True,
                           include_room_context: bool = True,
                           include_session_history: bool = True) -> str:
        """Build the context text handed to the model.

        Up to three sections are produced (user info, room info, current
        conversation), each only when enabled and non-empty, and joined
        with blank lines. The session section is included only for
        sessions that already have a context manager.

        Returns:
            The combined text, or ``""`` if an exception occurred.
        """
        try:
            context_parts = []

            # 1. User memory
            if include_user_memory:
                user_memory = self.user_memory_manager.get_user_memory(user_id)
                user_context = self._format_user_memory_for_ai(user_memory)
                if user_context:
                    context_parts.append(f"=== ์‚ฌ์šฉ์ž ์ •๋ณด ===\n{user_context}")

            # 2. Room context
            if include_room_context:
                room_context = self.room_context_manager.get_room_context(room_id)
                room_context_str = self._format_room_context_for_ai(room_context)
                if room_context_str:
                    context_parts.append(f"=== ์ฑ„ํŒ…๋ฐฉ ์ •๋ณด ===\n{room_context_str}")

            # 3. Session history (existing sessions only)
            if include_session_history and session_id in self.session_context_managers:
                session_manager = self.session_context_managers[session_id]
                session_context = session_manager.get_context(include_system=True, max_length=1000)
                if session_context:
                    context_parts.append(f"=== ํ˜„์žฌ ๋Œ€ํ™” ===\n{session_context}")

            combined_context = "\n\n".join(context_parts)

            logger.debug(f"๐Ÿ”— AI์šฉ ํ†ตํ•ฉ ์ปจํ…์ŠคํŠธ ์ƒ์„ฑ: {user_id}/{room_id}/{session_id} - {len(combined_context)} ๋ฌธ์ž")
            return combined_context

        except Exception as e:
            logger.error(f"โŒ AI์šฉ ์ปจํ…์ŠคํŠธ ์ƒ์„ฑ ์‹คํŒจ: {user_id}/{room_id}/{session_id} - {e}")
            return ""

    def _create_combined_context(self, user_memory: UserMemory,
                                 room_context: RoomContext,
                                 session_id: str) -> str:
        """Render a short overview: user name, preferences, room, documents.

        Only the file names of the last three documents are listed.
        ``session_id`` is accepted but not used.

        Returns:
            Newline-joined overview, or ``""`` on error.
        """
        try:
            context_parts = []

            if user_memory.name:
                context_parts.append(f"์‚ฌ์šฉ์ž: {user_memory.name}")

            if user_memory.preferences:
                prefs = ", ".join([f"{k}: {v}" for k, v in user_memory.preferences.items()])
                context_parts.append(f"์„ ํ˜ธ๋„: {prefs}")

            if room_context.room_name:
                context_parts.append(f"์ฑ„ํŒ…๋ฐฉ: {room_context.room_name}")

            if room_context.documents:
                doc_names = [
                    self._doc_field(d, 'filename', 'unknown')
                    for d in room_context.documents[-3:]
                ]
                doc_info = f"๋ฌธ์„œ {len(room_context.documents)}๊ฐœ: " + ", ".join(doc_names)
                context_parts.append(doc_info)

            return "\n".join(context_parts)

        except Exception as e:
            logger.error(f"โŒ ํ†ตํ•ฉ ์ปจํ…์ŠคํŠธ ์ƒ์„ฑ ์‹คํŒจ: {e}")
            return ""

    def _create_memory_summary(self, user_memory: UserMemory,
                               room_context: RoomContext) -> str:
        """Render a one-line summary of user and room memory.

        The conversation summary is cut to 100 characters; the ellipsis is
        appended only when something was actually cut.

        Returns:
            Parts joined with ``" | "``, or ``""`` on error.
        """
        try:
            summary_parts = []

            if user_memory.important_info:
                summary_parts.append(f"์‚ฌ์šฉ์ž ์ค‘์š” ์ •๋ณด: {len(user_memory.important_info)}๊ฐœ")

            if user_memory.expertise_areas:
                summary_parts.append(f"์ „๋ฌธ ๋ถ„์•ผ: {', '.join(user_memory.expertise_areas)}")

            conversation_summary = room_context.conversation_summary
            if conversation_summary:
                if len(conversation_summary) > 100:
                    conversation_summary = conversation_summary[:100] + "..."
                summary_parts.append(f"๋Œ€ํ™” ์š”์•ฝ: {conversation_summary}")

            if room_context.key_topics:
                summary_parts.append(f"์ฃผ์š” ์ฃผ์ œ: {', '.join(room_context.key_topics)}")

            return " | ".join(summary_parts)

        except Exception as e:
            logger.error(f"โŒ ๋ฉ”๋ชจ๋ฆฌ ์š”์•ฝ ์ƒ์„ฑ ์‹คํŒจ: {e}")
            return ""

    def _format_user_memory_for_ai(self, user_memory: UserMemory) -> str:
        """Render the populated user-memory fields, one per line.

        Returns:
            Newline-joined lines, or ``""`` on error.
        """
        try:
            parts = []

            if user_memory.name:
                parts.append(f"์ด๋ฆ„: {user_memory.name}")

            if user_memory.important_info:
                parts.append(f"์ค‘์š” ์ •๋ณด: {', '.join(user_memory.important_info)}")

            if user_memory.expertise_areas:
                parts.append(f"์ „๋ฌธ ๋ถ„์•ผ: {', '.join(user_memory.expertise_areas)}")

            if user_memory.interests:
                parts.append(f"๊ด€์‹ฌ์‚ฌ: {', '.join(user_memory.interests)}")

            if user_memory.communication_style:
                parts.append(f"๋Œ€ํ™” ์Šคํƒ€์ผ: {user_memory.communication_style}")

            if user_memory.ai_personality:
                parts.append(f"AI ์„ฑ๊ฒฉ: {user_memory.ai_personality}")

            return "\n".join(parts)

        except Exception as e:
            logger.error(f"โŒ ์‚ฌ์šฉ์ž ๋ฉ”๋ชจ๋ฆฌ ํฌ๋งทํŒ… ์‹คํŒจ: {e}")
            return ""

    def _format_room_context_for_ai(self, room_context: RoomContext) -> str:
        """Render the populated room-context fields, one per line.

        Documents may be dicts or objects; only the last three are listed.

        Returns:
            Newline-joined lines, or ``""`` on error.
        """
        try:
            parts = []

            if room_context.room_name:
                parts.append(f"์ฑ„ํŒ…๋ฐฉ: {room_context.room_name}")

            if room_context.description:
                parts.append(f"์„ค๋ช…: {room_context.description}")

            if room_context.documents:
                parts.append(f"์—…๋กœ๋“œ๋œ ๋ฌธ์„œ: {len(room_context.documents)}๊ฐœ")
                for doc in room_context.documents[-3:]:  # most recent three
                    filename = self._doc_field(doc, 'filename', 'unknown')
                    doc_type = self._doc_field(doc, 'document_type', 'unknown')
                    page_count = self._doc_field(doc, 'page_count', 0)

                    parts.append(f" - {filename} ({doc_type}, {page_count}ํŽ˜์ด์ง€)")

            if room_context.conversation_summary:
                parts.append(f"๋Œ€ํ™” ์š”์•ฝ: {room_context.conversation_summary}")

            if room_context.key_topics:
                parts.append(f"์ฃผ์š” ์ฃผ์ œ: {', '.join(room_context.key_topics)}")

            return "\n".join(parts)

        except Exception as e:
            logger.error(f"โŒ ์ฑ„ํŒ…๋ฐฉ ์ปจํ…์ŠคํŠธ ํฌ๋งทํŒ… ์‹คํŒจ: {e}")
            return ""

    def _create_default_memory_context(self, user_id: str, room_id: str, session_id: str) -> MemoryContext:
        """Build a fallback ``MemoryContext`` with empty rendered text.

        The memory objects are loaded once more; if that fails as well the
        context is returned with ``user_memory`` and ``room_context`` set
        to ``None``.
        """
        try:
            user_memory = self.user_memory_manager.get_user_memory(user_id)
            room_context = self.room_context_manager.get_room_context(room_id)
        except Exception as e:
            logger.error(f"โŒ ๊ธฐ๋ณธ ๋ฉ”๋ชจ๋ฆฌ ์ปจํ…์ŠคํŠธ ์ƒ์„ฑ ์‹คํŒจ: {e}")
            user_memory = None
            room_context = None

        return MemoryContext(
            user_id=user_id,
            user_memory=user_memory,
            room_id=room_id,
            room_context=room_context,
            session_id=session_id,
            combined_context="",
            memory_summary=""
        )

    def get_session_context_manager(self, session_id: str) -> AdvancedContextManager:
        """Return the context manager of a session, creating it if needed."""
        if session_id not in self.session_context_managers:
            self.session_context_managers[session_id] = AdvancedContextManager(
                max_tokens=2000,
                max_turns=20,
                strategy="sliding_window"
            )

        return self.session_context_managers[session_id]

    def cleanup_old_sessions(self, max_age_hours: int = 24):
        """Drop session managers whose id carries an old timestamp.

        The last ``_``-separated part of the session id is parsed as an
        integer timestamp and compared with ``time.time()``. Ids without
        ``_`` or without a numeric suffix are left alone.

        Args:
            max_age_hours: Sessions older than this many hours are removed.
        """
        try:
            current_time = time.time()
            max_age_seconds = max_age_hours * 3600

            sessions_to_remove = []

            for session_id in self.session_context_managers:
                if "_" not in session_id:
                    continue
                try:
                    timestamp = int(session_id.split("_")[-1])
                except ValueError:
                    # No parsable timestamp: keep the session.
                    continue

                if current_time - timestamp > max_age_seconds:
                    sessions_to_remove.append(session_id)

            # Removal happens after iteration so the dict is not mutated
            # while being traversed.
            for session_id in sessions_to_remove:
                del self.session_context_managers[session_id]
                logger.info(f"๐Ÿ—‘๏ธ ์˜ค๋ž˜๋œ ์„ธ์…˜ ์ •๋ฆฌ: {session_id}")

            logger.info(f"โœ… ์„ธ์…˜ ์ •๋ฆฌ ์™„๋ฃŒ: {len(sessions_to_remove)}๊ฐœ ์ œ๊ฑฐ")

        except Exception as e:
            logger.error(f"โŒ ์„ธ์…˜ ์ •๋ฆฌ ์‹คํŒจ: {e}")
403
+ # ์ „์—ญ ์ธ์Šคํ„ด์Šค
404
+ integrated_memory_manager = IntegratedMemoryManager()
lily_llm_core/room_context_manager.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Per-chat-room context manager (Room Context Manager)
4
+ Manages an independent conversation context and document history for each chat room
5
+ """
6
+
7
+ import logging
8
+ import time
9
+ import json
10
+ import os
11
+ from typing import Dict, Any, List, Optional, Tuple
12
+ from dataclasses import dataclass, asdict
13
+ from pathlib import Path
14
+ from collections import deque
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
@dataclass
class RoomDocument:
    """Metadata for one document uploaded to a chat room.

    ``tags`` may be omitted or passed as ``None``; in both cases the instance
    ends up with an empty list. A supplied sequence is copied, so later
    changes to the caller's list do not leak into the stored document.
    """
    document_id: str
    filename: str
    uploaded_at: float  # epoch seconds
    uploaded_by: str
    document_type: str
    page_count: int
    chunk_count: int
    summary: Optional[str] = None
    tags: Optional[List[str]] = None

    def __post_init__(self):
        # None means "no tags"; anything else is copied so this instance owns its list.
        self.tags = [] if self.tags is None else list(self.tags)
34
+
35
@dataclass
class RoomContext:
    """Context data kept for a single chat room.

    Container fields default to ``None`` in the signature and are replaced by
    fresh empty containers in ``__post_init__``. Supplied containers are
    shallow-copied so the instance never shares them with the caller.
    """
    room_id: str
    created_at: float
    last_updated: float

    # Basic information
    room_name: Optional[str] = None
    description: Optional[str] = None
    participants: Optional[List[str]] = None

    # Document history
    documents: Optional[List["RoomDocument"]] = None

    # Conversation context
    conversation_summary: Optional[str] = None
    key_topics: Optional[List[str]] = None
    total_messages: int = 0

    # AI settings
    ai_context: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        # Normalise every container: None -> empty, otherwise an owned shallow copy.
        self.participants = [] if self.participants is None else list(self.participants)
        self.documents = [] if self.documents is None else list(self.documents)
        self.key_topics = [] if self.key_topics is None else list(self.key_topics)
        self.ai_context = {} if self.ai_context is None else dict(self.ai_context)
67
+
68
class RoomContextManager:
    """Store and retrieve per-room chat context.

    Each room is persisted as ``<storage_dir>/<room_id>.json`` and recently
    used rooms are kept in ``room_cache``. Most public methods log failures
    and return ``False`` / an empty value instead of raising.
    """

    def __init__(self, storage_dir: str = "room_contexts"):
        """Create the storage directory if needed and start with an empty cache.

        Args:
            storage_dir: Directory that holds one JSON file per room.
        """
        self.storage_dir = Path(storage_dir)
        # parents=True so a nested storage_dir can be created on first start.
        self.storage_dir.mkdir(parents=True, exist_ok=True)

        # Context cache
        self.room_cache: Dict[str, RoomContext] = {}
        self.cache_size_limit = 50  # cache at most 50 rooms

        # Statistics
        self.total_rooms = 0
        self.total_documents = 0

        logger.info(f"๐Ÿš€ RoomContextManager ์ดˆ๊ธฐํ™”: {self.storage_dir}")

    def get_room_context(self, room_id: str) -> RoomContext:
        """Return the context for ``room_id``: cache first, then file, else a new one.

        A context loaded from disk or newly created is added to the cache.
        """
        # Check the cache first
        if room_id in self.room_cache:
            logger.debug(f"๐Ÿ“‹ ์บ์‹œ์—์„œ ์ฑ„ํŒ…๋ฐฉ ์ปจํ…์ŠคํŠธ ์กฐํšŒ: {room_id}")
            return self.room_cache[room_id]

        # Load from file
        context = self._load_context_from_file(room_id)
        if context:
            self._add_to_cache(room_id, context)
            return context

        # Nothing stored (or unreadable): create a fresh context
        context = self._create_new_context(room_id)
        self._add_to_cache(room_id, context)
        return context

    def add_document(self, room_id: str, document_info: Dict[str, Any]) -> bool:
        """Append a document described by ``document_info`` to the room and persist it.

        ``document_info`` must contain ``"document_id"`` and ``"filename"``;
        the remaining keys are optional. Returns ``True`` on success and
        ``False`` (after logging) on any error, including a missing key.
        """
        try:
            context = self.get_room_context(room_id)

            # Build the new document record
            document = RoomDocument(
                document_id=document_info["document_id"],
                filename=document_info["filename"],
                uploaded_at=time.time(),
                uploaded_by=document_info.get("uploaded_by", "unknown"),
                document_type=document_info.get("document_type", "unknown"),
                page_count=document_info.get("page_count", 0),
                chunk_count=document_info.get("chunk_count", 0),
                summary=document_info.get("summary"),
                tags=document_info.get("tags", [])
            )

            context.documents.append(document)
            # NOTE(review): total_documents is not a declared RoomContext field, so
            # asdict() does not persist it; kept for any in-process reader.
            context.total_documents = len(context.documents)
            context.last_updated = time.time()

            # Persist
            self._save_context_to_file(context)

            logger.info(f"๐Ÿ“„ ๋ฌธ์„œ ์ถ”๊ฐ€ ์™„๋ฃŒ: {room_id} - {document.filename}")
            return True

        except Exception as e:
            logger.error(f"โŒ ๋ฌธ์„œ ์ถ”๊ฐ€ ์‹คํŒจ: {room_id} - {e}")
            return False

    def get_documents(self, room_id: str) -> List[RoomDocument]:
        """Return the room's document list (the stored list itself), or ``[]`` on error."""
        try:
            context = self.get_room_context(room_id)
            return context.documents

        except Exception as e:
            logger.error(f"โŒ ๋ฌธ์„œ ๋ชฉ๋ก ์กฐํšŒ ์‹คํŒจ: {room_id} - {e}")
            return []

    def find_document(self, room_id: str, document_id: str) -> Optional[RoomDocument]:
        """Return the first document in the room whose id equals ``document_id``, else ``None``."""
        try:
            context = self.get_room_context(room_id)

            for doc in context.documents:
                if doc.document_id == document_id:
                    return doc

            return None

        except Exception as e:
            logger.error(f"โŒ ๋ฌธ์„œ ๊ฒ€์ƒ‰ ์‹คํŒจ: {room_id} - {document_id} - {e}")
            return None

    def update_conversation_summary(self, room_id: str, summary: str, topics: List[str] = None) -> bool:
        """Replace the conversation summary and, when ``topics`` is non-empty, the key topics."""
        try:
            context = self.get_room_context(room_id)

            context.conversation_summary = summary
            if topics:
                context.key_topics = topics

            context.last_updated = time.time()

            self._save_context_to_file(context)
            logger.info(f"๐Ÿ“ ๋Œ€ํ™” ์š”์•ฝ ์—…๋ฐ์ดํŠธ: {room_id} - {len(summary)} ๋ฌธ์ž")
            return True

        except Exception as e:
            logger.error(f"โŒ ๋Œ€ํ™” ์š”์•ฝ ์—…๋ฐ์ดํŠธ ์‹คํŒจ: {room_id} - {e}")
            return False

    def increment_message_count(self, room_id: str) -> bool:
        """Increase the room's ``total_messages`` by one and persist the context."""
        try:
            context = self.get_room_context(room_id)
            context.total_messages += 1
            context.last_updated = time.time()

            self._save_context_to_file(context)
            return True

        except Exception as e:
            logger.error(f"โŒ ๋ฉ”์‹œ์ง€ ์ˆ˜ ์ฆ๊ฐ€ ์‹คํŒจ: {room_id} - {e}")
            return False

    def add_participant(self, room_id: str, user_id: str) -> bool:
        """Add ``user_id`` to the room's participants.

        Returns ``True`` when the user was added, ``False`` when the user was
        already a participant or an error occurred.
        """
        try:
            context = self.get_room_context(room_id)

            if user_id not in context.participants:
                context.participants.append(user_id)
                context.last_updated = time.time()

                self._save_context_to_file(context)
                logger.info(f"๐Ÿ‘ฅ ์ฐธ๊ฐ€์ž ์ถ”๊ฐ€: {room_id} - {user_id}")
                return True

            return False

        except Exception as e:
            logger.error(f"โŒ ์ฐธ๊ฐ€์ž ์ถ”๊ฐ€ ์‹คํŒจ: {room_id} - {user_id} - {e}")
            return False

    def set_room_info(self, room_id: str, name: str = None, description: str = None) -> bool:
        """Set the room name and/or description; falsy arguments leave the field unchanged."""
        try:
            context = self.get_room_context(room_id)

            if name:
                context.room_name = name
            if description:
                context.description = description

            context.last_updated = time.time()

            self._save_context_to_file(context)
            logger.info(f"๐Ÿท๏ธ ์ฑ„ํŒ…๋ฐฉ ์ •๋ณด ์„ค์ •: {room_id} - {name}")
            return True

        except Exception as e:
            logger.error(f"โŒ ์ฑ„ํŒ…๋ฐฉ ์ •๋ณด ์„ค์ • ์‹คํŒจ: {room_id} - {e}")
            return False

    def get_context_summary(self, room_id: str) -> Dict[str, Any]:
        """Return a plain-dict summary of the room, including its five newest documents.

        Returns ``{}`` (after logging) when anything goes wrong.
        """
        try:
            context = self.get_room_context(room_id)
            value = self._document_value

            return {
                "room_id": context.room_id,
                "room_name": context.room_name,
                "description": context.description,
                "participants": context.participants,
                "total_documents": len(context.documents),
                "total_messages": context.total_messages,
                "conversation_summary": context.conversation_summary,
                "key_topics": context.key_topics,
                "last_updated": context.last_updated,
                "documents": [
                    {
                        "document_id": value(doc, 'document_id', 'unknown'),
                        "filename": value(doc, 'filename', 'unknown'),
                        "uploaded_at": value(doc, 'uploaded_at', 0),
                        "document_type": value(doc, 'document_type', 'unknown'),
                        "page_count": value(doc, 'page_count', 0),
                        "chunk_count": value(doc, 'chunk_count', 0),
                        "summary": value(doc, 'summary', '')
                    }
                    for doc in context.documents[-5:]  # most recent 5 only
                ]
            }

        except Exception as e:
            logger.error(f"โŒ ์ปจํ…์ŠคํŠธ ์š”์•ฝ ์ƒ์„ฑ ์‹คํŒจ: {room_id} - {e}")
            return {}

    @staticmethod
    def _document_value(doc: Any, key: str, default: Any) -> Any:
        """Read ``key`` from a RoomDocument, or from a plain-dict document entry."""
        if hasattr(doc, key):
            return getattr(doc, key)
        return doc.get(key, default)

    @staticmethod
    def _known_fields(dataclass_type: Any, data: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``data`` without keys that ``dataclass_type`` does not declare.

        Keeps a stored file loadable when it carries extra keys, instead of
        failing the load and letting a fresh context overwrite the file.
        """
        from dataclasses import fields  # local import: the module only imports dataclass/asdict

        declared = {f.name for f in fields(dataclass_type)}
        return {key: value for key, value in data.items() if key in declared}

    def _context_path(self, room_id: str) -> Path:
        """Return the JSON file path for ``room_id``.

        Raises:
            ValueError: if the id would not map to a plain file name inside
                ``storage_dir`` (e.g. it contains a path separator).
        """
        file_name = f"{room_id}.json"
        # Path(...).name strips directories/drives; a mismatch means the id is path-like.
        if Path(file_name).name != file_name:
            raise ValueError(f"invalid room_id: {room_id!r}")
        return self.storage_dir / file_name

    def _create_new_context(self, room_id: str) -> RoomContext:
        """Create, persist and return a default context for ``room_id``."""
        context = RoomContext(
            room_id=room_id,
            created_at=time.time(),
            last_updated=time.time()
        )

        # Defaults
        context.room_name = f"์ฑ„ํŒ…๋ฐฉ {room_id}"
        context.description = "์ƒˆ๋กœ ์ƒ์„ฑ๋œ ์ฑ„ํŒ…๋ฐฉ์ž…๋‹ˆ๋‹ค."

        # Persist (failure is logged by the helper; the context is still returned)
        self._save_context_to_file(context)

        logger.info(f"๐Ÿ†• ์ƒˆ ์ฑ„ํŒ…๋ฐฉ ์ปจํ…์ŠคํŠธ ์ƒ์„ฑ: {room_id}")
        return context

    def _save_context_to_file(self, context: RoomContext) -> bool:
        """Write ``context`` to its JSON file; return ``True`` on success.

        The data is written to a sibling ``.tmp`` file and then moved over the
        target, so an interrupted write cannot leave a truncated context file.
        """
        try:
            file_path = self._context_path(context.room_id)
            tmp_path = file_path.with_name(file_path.name + ".tmp")

            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(asdict(context), f, ensure_ascii=False, indent=2)

            os.replace(tmp_path, file_path)
            return True

        except Exception as e:
            logger.error(f"โŒ ์ปจํ…์ŠคํŠธ ํŒŒ์ผ ์ €์žฅ ์‹คํŒจ: {context.room_id} - {e}")
            return False

    def _load_context_from_file(self, room_id: str) -> Optional[RoomContext]:
        """Load the room's context from disk.

        Returns ``None`` when no file exists or when it cannot be read/parsed
        (the error is logged).
        """
        try:
            file_path = self._context_path(room_id)

            if not file_path.exists():
                return None

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            data = self._known_fields(RoomContext, data)

            # Turn each stored document dict back into a RoomDocument
            if isinstance(data.get('documents'), list):
                data['documents'] = [
                    RoomDocument(**self._known_fields(RoomDocument, item))
                    if isinstance(item, dict) else item
                    for item in data['documents']
                ]

            context = RoomContext(**data)
            logger.debug(f"๐Ÿ“‚ ํŒŒ์ผ์—์„œ ์ปจํ…์ŠคํŠธ ๋กœ๋“œ: {room_id}")
            return context

        except Exception as e:
            logger.error(f"โŒ ์ปจํ…์ŠคํŠธ ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {room_id} - {e}")
            return None

    def _add_to_cache(self, room_id: str, context: RoomContext):
        """Cache ``context``, evicting one entry first when the cache is full.

        The evicted entry is the one with the smallest ``last_updated`` value.
        Nothing is evicted when ``room_id`` is already cached (the entry is
        simply replaced) or when the cache is empty.
        """
        is_new_key = room_id not in self.room_cache
        if is_new_key and self.room_cache and len(self.room_cache) >= self.cache_size_limit:
            oldest_room = min(self.room_cache,
                              key=lambda k: self.room_cache[k].last_updated)
            del self.room_cache[oldest_room]
            logger.debug(f"๐Ÿ—‘๏ธ ์บ์‹œ์—์„œ ์˜ค๋ž˜๋œ ์ปจํ…์ŠคํŠธ ์ œ๊ฑฐ: {oldest_room}")

        self.room_cache[room_id] = context

    def get_all_rooms(self) -> List[str]:
        """Return the ids of all rooms that have a ``*.json`` file in ``storage_dir``."""
        try:
            return [f.stem for f in self.storage_dir.glob("*.json")]

        except Exception as e:
            logger.error(f"โŒ ์ฑ„ํŒ…๋ฐฉ ๋ชฉ๋ก ์กฐํšŒ ์‹คํŒจ: {e}")
            return []

    def delete_room_context(self, room_id: str) -> bool:
        """Remove the room from the cache and delete its file, if present."""
        try:
            # Remove from cache
            self.room_cache.pop(room_id, None)

            # Delete the file
            file_path = self._context_path(room_id)
            if file_path.exists():
                file_path.unlink()

            logger.info(f"๐Ÿ—‘๏ธ ์ฑ„ํŒ…๋ฐฉ ์ปจํ…์ŠคํŠธ ์‚ญ์ œ ์™„๋ฃŒ: {room_id}")
            return True

        except Exception as e:
            logger.error(f"โŒ ์ฑ„ํŒ…๋ฐฉ ์ปจํ…์ŠคํŠธ ์‚ญ์ œ ์‹คํŒจ: {room_id} - {e}")
            return False
372
+
373
+ # Global (module-level) instance
374
+ room_context_manager = RoomContextManager()
lily_llm_core/user_memory_manager.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Per-user global long-term memory manager (User Memory Manager)
4
+ A per-user global information store, similar to ChatGPT's "memory" feature
5
+ """
6
+
7
+ import logging
8
+ import time
9
+ import json
10
+ import os
11
+ from typing import Dict, Any, List, Optional, Tuple
12
+ from dataclasses import dataclass, asdict
13
+ from pathlib import Path
14
+ import hashlib
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
@dataclass
class UserMemory:
    """Global (cross-room) memory record kept for one user.

    Container fields default to ``None`` in the signature and are replaced by
    fresh empty containers in ``__post_init__``. Supplied containers are
    shallow-copied so the instance never shares them with the caller.
    """
    user_id: str
    created_at: float
    last_updated: float

    # Basic information
    name: Optional[str] = None
    preferences: Optional[Dict[str, Any]] = None
    important_info: Optional[List[str]] = None

    # Conversation patterns and preferences
    communication_style: Optional[str] = None
    language_preference: Optional[str] = None
    response_length_preference: Optional[str] = None

    # Areas of expertise and interests
    expertise_areas: Optional[List[str]] = None
    interests: Optional[List[str]] = None

    # Usage statistics
    total_conversations: int = 0
    total_messages: int = 0
    favorite_topics: Optional[List[str]] = None

    # AI settings and preferences
    ai_personality: Optional[str] = None
    ai_response_style: Optional[str] = None

    def __post_init__(self):
        # Normalise every container: None -> empty, otherwise an owned shallow copy.
        self.preferences = {} if self.preferences is None else dict(self.preferences)
        self.important_info = [] if self.important_info is None else list(self.important_info)
        self.expertise_areas = [] if self.expertise_areas is None else list(self.expertise_areas)
        self.interests = [] if self.interests is None else list(self.interests)
        self.favorite_topics = [] if self.favorite_topics is None else list(self.favorite_topics)
59
+
60
class UserMemoryManager:
    """Store and retrieve per-user global memory.

    Each user is persisted as ``<storage_dir>/<user_id>.json`` and recently
    used records are kept in ``memory_cache``. Most public methods log
    failures and return ``False`` / an empty value instead of raising.
    """

    def __init__(self, storage_dir: str = "user_memories"):
        """Create the storage directory if needed and start with an empty cache.

        Args:
            storage_dir: Directory that holds one JSON file per user.
        """
        self.storage_dir = Path(storage_dir)
        # parents=True so a nested storage_dir can be created on first start.
        self.storage_dir.mkdir(parents=True, exist_ok=True)

        # In-memory cache of user records
        self.memory_cache: Dict[str, UserMemory] = {}
        self.cache_size_limit = 100  # cache at most 100 users

        # Statistics
        self.total_users = 0
        self.total_memories = 0

        logger.info(f"๐Ÿš€ UserMemoryManager ์ดˆ๊ธฐํ™”: {self.storage_dir}")

    def get_user_memory(self, user_id: str) -> UserMemory:
        """Return the memory for ``user_id``: cache first, then file, else a new one.

        A record loaded from disk or newly created is added to the cache.
        """
        # Check the cache first
        if user_id in self.memory_cache:
            logger.debug(f"๐Ÿ“‹ ์บ์‹œ์—์„œ ์‚ฌ์šฉ์ž ๋ฉ”๋ชจ๋ฆฌ ์กฐํšŒ: {user_id}")
            return self.memory_cache[user_id]

        # Load from file
        memory = self._load_memory_from_file(user_id)
        if memory:
            self._add_to_cache(user_id, memory)
            return memory

        # Nothing stored (or unreadable): create a fresh record
        memory = self._create_new_memory(user_id)
        self._add_to_cache(user_id, memory)
        return memory

    def update_user_memory(self, user_id: str, updates: Dict[str, Any]) -> bool:
        """Apply ``updates`` to the user's memory and persist it.

        Only declared ``UserMemory`` fields are written. ``user_id`` is never
        overwritten because the cache key and file name are derived from it.
        Unknown keys are ignored.
        """
        try:
            memory = self.get_user_memory(user_id)
            writable = self._declared_fields() - {"user_id"}

            for key, value in updates.items():
                if key in writable:
                    setattr(memory, key, value)
                    logger.debug(f"๐Ÿ“ ์‚ฌ์šฉ์ž ๋ฉ”๋ชจ๋ฆฌ ์—…๋ฐ์ดํŠธ: {user_id}.{key} = {value}")

            # Refresh the timestamp
            memory.last_updated = time.time()

            # Persist
            self._save_memory_to_file(memory)

            # Keep the cache entry pointing at the updated record
            if user_id in self.memory_cache:
                self.memory_cache[user_id] = memory

            logger.info(f"โœ… ์‚ฌ์šฉ์ž ๋ฉ”๋ชจ๋ฆฌ ์—…๋ฐ์ดํŠธ ์™„๋ฃŒ: {user_id}")
            return True

        except Exception as e:
            logger.error(f"โŒ ์‚ฌ์šฉ์ž ๋ฉ”๋ชจ๋ฆฌ ์—…๋ฐ์ดํŠธ ์‹คํŒจ: {user_id} - {e}")
            return False

    def add_important_info(self, user_id: str, info: str) -> bool:
        """Append ``info`` to the user's important-info list.

        Returns ``True`` when it was added, ``False`` when it was already
        present or an error occurred.
        """
        try:
            memory = self.get_user_memory(user_id)

            if info not in memory.important_info:
                memory.important_info.append(info)
                memory.last_updated = time.time()

                self._save_memory_to_file(memory)
                logger.info(f"๐Ÿ“Œ ์ค‘์š” ์ •๋ณด ์ถ”๊ฐ€: {user_id} - {info[:50]}...")
                return True

            return False

        except Exception as e:
            logger.error(f"โŒ ์ค‘์š” ์ •๋ณด ์ถ”๊ฐ€ ์‹คํŒจ: {user_id} - {e}")
            return False

    def update_preferences(self, user_id: str, preferences: Dict[str, Any]) -> bool:
        """Merge ``preferences`` into the user's stored preferences and persist."""
        try:
            memory = self.get_user_memory(user_id)

            # Merge with the existing preferences
            memory.preferences.update(preferences)
            memory.last_updated = time.time()

            self._save_memory_to_file(memory)
            logger.info(f"โš™๏ธ ์‚ฌ์šฉ์ž ์„ ํ˜ธ๋„ ์—…๋ฐ์ดํŠธ: {user_id} - {len(preferences)}๊ฐœ ํ•ญ๋ชฉ")
            return True

        except Exception as e:
            logger.error(f"โŒ ์‚ฌ์šฉ์ž ์„ ํ˜ธ๋„ ์—…๋ฐ์ดํŠธ ์‹คํŒจ: {user_id} - {e}")
            return False

    def record_conversation(self, user_id: str, topic: str = None) -> bool:
        """Bump the user's conversation/message counters and remember ``topic``.

        A new, truthy ``topic`` is appended to ``favorite_topics``, which is
        trimmed to its 10 most recent entries.
        """
        try:
            memory = self.get_user_memory(user_id)

            memory.total_conversations += 1
            memory.total_messages += 1

            if topic and topic not in memory.favorite_topics:
                memory.favorite_topics.append(topic)
                # Keep at most 10 entries
                if len(memory.favorite_topics) > 10:
                    memory.favorite_topics = memory.favorite_topics[-10:]

            memory.last_updated = time.time()

            self._save_memory_to_file(memory)
            return True

        except Exception as e:
            logger.error(f"โŒ ๋Œ€ํ™” ๊ธฐ๋ก ์‹คํŒจ: {user_id} - {e}")
            return False

    def get_memory_summary(self, user_id: str) -> Dict[str, Any]:
        """Return a plain-dict summary of the user's memory.

        ``important_info`` and ``favorite_topics`` are limited to their five
        most recently added entries. Returns ``{}`` (after logging) on error.
        """
        try:
            memory = self.get_user_memory(user_id)

            return {
                "user_id": memory.user_id,
                "name": memory.name,
                "preferences": memory.preferences,
                # Entries are appended, so the newest five are at the end.
                "important_info": memory.important_info[-5:],
                "expertise_areas": memory.expertise_areas,
                "interests": memory.interests,
                "communication_style": memory.communication_style,
                "ai_personality": memory.ai_personality,
                "total_conversations": memory.total_conversations,
                "favorite_topics": memory.favorite_topics[-5:],  # most recent 5 only
                "last_updated": memory.last_updated
            }

        except Exception as e:
            logger.error(f"โŒ ๋ฉ”๋ชจ๋ฆฌ ์š”์•ฝ ์ƒ์„ฑ ์‹คํŒจ: {user_id} - {e}")
            return {}

    @staticmethod
    def _declared_fields() -> set:
        """Return the set of field names declared on ``UserMemory``."""
        from dataclasses import fields  # local import: the module only imports dataclass/asdict

        return {f.name for f in fields(UserMemory)}

    def _memory_path(self, user_id: str) -> Path:
        """Return the JSON file path for ``user_id``.

        Raises:
            ValueError: if the id would not map to a plain file name inside
                ``storage_dir`` (e.g. it contains a path separator).
        """
        file_name = f"{user_id}.json"
        # Path(...).name strips directories/drives; a mismatch means the id is path-like.
        if Path(file_name).name != file_name:
            raise ValueError(f"invalid user_id: {user_id!r}")
        return self.storage_dir / file_name

    def _create_new_memory(self, user_id: str) -> UserMemory:
        """Create, persist and return a default memory record for ``user_id``."""
        memory = UserMemory(
            user_id=user_id,
            created_at=time.time(),
            last_updated=time.time()
        )

        # Defaults
        memory.language_preference = "ko"
        memory.response_length_preference = "medium"
        memory.ai_personality = "friendly"
        memory.ai_response_style = "helpful"

        # Persist (failure is logged by the helper; the record is still returned)
        self._save_memory_to_file(memory)

        logger.info(f"๐Ÿ†• ์ƒˆ ์‚ฌ์šฉ์ž ๋ฉ”๋ชจ๋ฆฌ ์ƒ์„ฑ: {user_id}")
        return memory

    def _save_memory_to_file(self, memory: UserMemory) -> bool:
        """Write ``memory`` to its JSON file; return ``True`` on success.

        The data is written to a sibling ``.tmp`` file and then moved over the
        target, so an interrupted write cannot leave a truncated memory file.
        """
        try:
            file_path = self._memory_path(memory.user_id)
            tmp_path = file_path.with_name(file_path.name + ".tmp")

            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(asdict(memory), f, ensure_ascii=False, indent=2)

            os.replace(tmp_path, file_path)
            return True

        except Exception as e:
            logger.error(f"โŒ ๋ฉ”๋ชจ๋ฆฌ ํŒŒ์ผ ์ €์žฅ ์‹คํŒจ: {memory.user_id} - {e}")
            return False

    def _load_memory_from_file(self, user_id: str) -> Optional[UserMemory]:
        """Load the user's memory from disk.

        Keys that ``UserMemory`` does not declare are dropped so a file with
        extra keys still loads. Returns ``None`` when no file exists or when
        it cannot be read/parsed (the error is logged).
        """
        try:
            file_path = self._memory_path(user_id)

            if not file_path.exists():
                return None

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            declared = self._declared_fields()
            memory = UserMemory(**{k: v for k, v in data.items() if k in declared})
            logger.debug(f"๐Ÿ“‚ ํŒŒ์ผ์—์„œ ๋ฉ”๋ชจ๋ฆฌ ๋กœ๋“œ: {user_id}")
            return memory

        except Exception as e:
            logger.error(f"โŒ ๋ฉ”๋ชจ๋ฆฌ ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {user_id} - {e}")
            return None

    def _add_to_cache(self, user_id: str, memory: UserMemory):
        """Cache ``memory``, evicting one entry first when the cache is full.

        The evicted entry is the one with the smallest ``last_updated`` value.
        Nothing is evicted when ``user_id`` is already cached (the entry is
        simply replaced) or when the cache is empty.
        """
        is_new_key = user_id not in self.memory_cache
        if is_new_key and self.memory_cache and len(self.memory_cache) >= self.cache_size_limit:
            oldest_user = min(self.memory_cache,
                              key=lambda k: self.memory_cache[k].last_updated)
            del self.memory_cache[oldest_user]
            logger.debug(f"๐Ÿ—‘๏ธ ์บ์‹œ์—์„œ ์˜ค๋ž˜๋œ ๋ฉ”๋ชจ๋ฆฌ ์ œ๊ฑฐ: {oldest_user}")

        self.memory_cache[user_id] = memory

    def get_all_users(self) -> List[str]:
        """Return the ids of all users that have a ``*.json`` file in ``storage_dir``."""
        try:
            return [f.stem for f in self.storage_dir.glob("*.json")]

        except Exception as e:
            logger.error(f"โŒ ์‚ฌ์šฉ์ž ๋ชฉ๋ก ์กฐํšŒ ์‹คํŒจ: {e}")
            return []

    def delete_user_memory(self, user_id: str) -> bool:
        """Remove the user from the cache and delete the user's file, if present."""
        try:
            # Remove from cache
            self.memory_cache.pop(user_id, None)

            # Delete the file
            file_path = self._memory_path(user_id)
            if file_path.exists():
                file_path.unlink()

            logger.info(f"๐Ÿ—‘๏ธ ์‚ฌ์šฉ์ž ๋ฉ”๋ชจ๋ฆฌ ์‚ญ์ œ ์™„๋ฃŒ: {user_id}")
            return True

        except Exception as e:
            logger.error(f"โŒ ์‚ฌ์šฉ์ž ๋ฉ”๋ชจ๋ฆฌ ์‚ญ์ œ ์‹คํŒจ: {user_id} - {e}")
            return False
300
+
301
+ # Global (module-level) instance
302
+ user_memory_manager = UserMemoryManager()
room_contexts/default.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "room_id": "default",
3
+ "created_at": 1755887199.998416,
4
+ "last_updated": 1755888375.434728,
5
+ "room_name": "์ฑ„ํŒ…๋ฐฉ default",
6
+ "description": "์ƒˆ๋กœ ์ƒ์„ฑ๋œ ์ฑ„ํŒ…๋ฐฉ์ž…๋‹ˆ๋‹ค.",
7
+ "participants": [
8
+ "kdy"
9
+ ],
10
+ "documents": [
11
+ {
12
+ "document_id": "32653a11",
13
+ "filename": "oop์ปคํ”ผ๋งค๋‹ˆ์ €.pdf",
14
+ "uploaded_at": 1755887816.48926,
15
+ "uploaded_by": "kdy",
16
+ "document_type": "pdf",
17
+ "page_count": 0,
18
+ "chunk_count": 0,
19
+ "summary": "๋ฌธ์„œ๊ฐ€ ์„ฑ๊ณต์ ์œผ๋กœ ์ฒ˜๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.",
20
+ "tags": []
21
+ },
22
+ {
23
+ "document_id": "8991b80f",
24
+ "filename": "test_math.pdf",
25
+ "uploaded_at": 1755887939.8704937,
26
+ "uploaded_by": "kdy",
27
+ "document_type": "pdf",
28
+ "page_count": 0,
29
+ "chunk_count": 0,
30
+ "summary": "๋ฌธ์„œ๊ฐ€ ์„ฑ๊ณต์ ์œผ๋กœ ์ฒ˜๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.",
31
+ "tags": []
32
+ },
33
+ {
34
+ "document_id": "8ed78561",
35
+ "filename": "test_design.pdf",
36
+ "uploaded_at": 1755888122.0620363,
37
+ "uploaded_by": "kdy",
38
+ "document_type": "pdf",
39
+ "page_count": 0,
40
+ "chunk_count": 0,
41
+ "summary": "๋ฌธ์„œ๊ฐ€ ์„ฑ๊ณต์ ์œผ๋กœ ์ฒ˜๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.",
42
+ "tags": []
43
+ },
44
+ {
45
+ "document_id": "c737d6d1",
46
+ "filename": "test_design.pdf",
47
+ "uploaded_at": 1755888375.4331617,
48
+ "uploaded_by": "kdy",
49
+ "document_type": "pdf",
50
+ "page_count": 0,
51
+ "chunk_count": 0,
52
+ "summary": "๋ฌธ์„œ๊ฐ€ ์„ฑ๊ณต์ ์œผ๋กœ ์ฒ˜๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.",
53
+ "tags": []
54
+ }
55
+ ],
56
+ "conversation_summary": null,
57
+ "key_topics": [],
58
+ "total_messages": 4,
59
+ "ai_context": {}
60
+ }
user_memories/anonymous.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "user_id": "anonymous",
3
+ "created_at": 1755887199.9962583,
4
+ "last_updated": 1755887199.9962583,
5
+ "name": null,
6
+ "preferences": {},
7
+ "important_info": [],
8
+ "communication_style": null,
9
+ "language_preference": "ko",
10
+ "response_length_preference": "medium",
11
+ "expertise_areas": [],
12
+ "interests": [],
13
+ "total_conversations": 0,
14
+ "total_messages": 0,
15
+ "favorite_topics": [],
16
+ "ai_personality": "friendly",
17
+ "ai_response_style": "helpful"
18
+ }
user_memories/kdy.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "user_id": "kdy",
3
+ "created_at": 1755887816.4909644,
4
+ "last_updated": 1755888375.434728,
5
+ "name": null,
6
+ "preferences": {},
7
+ "important_info": [],
8
+ "communication_style": null,
9
+ "language_preference": "ko",
10
+ "response_length_preference": "medium",
11
+ "expertise_areas": [],
12
+ "interests": [],
13
+ "total_conversations": 8,
14
+ "total_messages": 8,
15
+ "favorite_topics": [
16
+ "๋ฌธ์„œ ์—…๋กœ๋“œ: oop์ปคํ”ผ๋งค๋‹ˆ์ €.pdf",
17
+ "๋ฌธ์„œ ์—…๋กœ๋“œ: test_math.pdf",
18
+ "๋ฌธ์„œ ์—…๋กœ๋“œ: test_design.pdf"
19
+ ],
20
+ "ai_personality": "friendly",
21
+ "ai_response_style": "helpful"
22
+ }