diff --git "a/lily_llm_api/app.py" "b/lily_llm_api/app.py"
--- "a/lily_llm_api/app.py"
+++ "b/lily_llm_api/app.py"
@@ -1,246 +1,4468 @@
 #!/usr/bin/env python3
 """
-Lily LLM API 서버
-파인튜닝된 Mistral-7B 모델을 RESTful API로 서빙
+Lily LLM API 서버 v2 (인터랙티브 선택 복원 및 성능 최적화 최종본)
 """
-
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException, Request, UploadFile, File, Form, Depends, WebSocket, WebSocketDisconnect
+from fastapi.security import HTTPAuthorizationCredentials
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import uvicorn
 import logging
 import time
 import torch
-from typing import Optional, List
+from datetime import datetime
+from typing import Optional, List, Union
+import asyncio
+import concurrent.futures
+import sys
+from PIL import Image
+import io
+import os
+import json
+from pathlib import Path
+import warnings
+
+# 🔄 RoPE 경고 숨기기 (Kanana 모델 내부 구현 관련)
+warnings.filterwarnings("ignore", message="The attention layers in this model are transitioning")
+warnings.filterwarnings("ignore", message="rotary_pos_emb will be removed")
+warnings.filterwarnings("ignore", message="position_embeddings will be mandatory")
 
-# 로깅 설정
-logging.basicConfig(level=logging.INFO)
+# logging 설정을 먼저 구성
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
-# FastAPI 앱 생성
+from .models import get_model_profile, list_available_models
+from lily_llm_core.rag_processor import rag_processor
+from lily_llm_core.document_processor import document_processor
+from lily_llm_core.hybrid_prompt_generator import hybrid_prompt_generator
+from lily_llm_core.database import db_manager
+from lily_llm_core.auth_manager import auth_manager
+from lily_llm_core.websocket_manager import connection_manager
+from lily_llm_core.celery_app import (
+    process_document_async, generate_ai_response_async, 
+    rag_query_async, batch_process_documents_async,
+    get_task_status, cancel_task
+)
+from lily_llm_core.performance_monitor import performance_monitor
+
+# 이미지 OCR 전용 모듈 추가
+from lily_llm_core.image_rag_processor import image_rag_processor
+from lily_llm_core.latex_rag_processor import latex_rag_processor
+from lily_llm_core.vector_store_manager import vector_store_manager
+
+# LaTeX-OCR + FAISS 통합 시스템 추가
+# from latex_ocr_faiss_integrated import LatexOCRFAISSIntegrated
+# from latex_ocr_faiss_simple import LatexOCRFAISSSimple
+
+# 멀티모달 RAG 프로세서 추가
+from lily_llm_core.hybrid_rag_processor import hybrid_rag_processor
+
+# 컨텍스트 관리자 및 LoRA 관리자 추가
+from lily_llm_core.context_manager import get_context_manager, context_manager
+
+# 계층적 메모리 시스템 추가
+from lily_llm_core.integrated_memory_manager import integrated_memory_manager
+from lily_llm_core.text_summarizer import text_summarizer, SummaryConfig
+
+# Module-level state shared by the model loader and the LoRA setup helper.
+# NOTE(review): current_profile and model_loaded are assigned None/False a
+# second time in the later globals section of this module.
+current_model = None  # currently loaded model instance
+current_profile = None  # currently selected model profile
+model_loaded = False  # whether a model has been loaded
+
+# Optional LoRA manager import: when the import fails, LORA_AVAILABLE is set to
+# False and both imported names are bound to None so later code can test them.
+try:
+    from lily_llm_core.lora_manager import get_lora_manager, lora_manager
+    LORA_AVAILABLE = True
+    logger.info("✅ LoRA 관리자 import 성공")
+except ImportError as e:
+    logger.warning(f"⚠️ LoRA 관리자 import 실패: {e}")
+    LORA_AVAILABLE = False
+    lora_manager = None
+    get_lora_manager = None
+
+# ===== 공통 LoRA 설정 함수 =====
+def setup_lora_for_model(profile, lora_manager):
+    """Configure and apply the automatic LoRA adapter for the loaded model.
+
+    Resolves a local model path and a model type from ``profile``, hands the
+    already-loaded main model (module global ``current_model``) to
+    ``lora_manager`` when it has no base model yet, builds the LoRA config and
+    applies the adapter named ``auto_adapter``.
+
+    Args:
+        profile: Model profile; ``local_path``, ``model_id`` and ``is_local``
+            are read when present.
+        lora_manager: LoRA manager object, or ``None`` when the optional
+            import failed.
+
+    Returns:
+        ``True`` when the adapter was applied; ``False`` when setup was
+        skipped or failed. Exceptions are logged here and never propagated.
+    """
+    if not LORA_AVAILABLE or not lora_manager:
+        logger.warning("⚠️ LoRA가 사용 불가능하여 자동 설정 건너뜀")
+        return False
+
+    try:
+        logger.info("🔧 LoRA 자동 설정 시작...")
+
+        model_id = getattr(profile, 'model_id', None)
+        local_path = getattr(profile, 'local_path', None)
+        current_model_path = None
+        model_type = "causal_lm"  # default
+
+        if local_path:
+            # Local environment: the profile carries the path itself; the
+            # model type still has to be derived from the model id.
+            current_model_path = local_path
+            if model_id == "kanana-1.5-v-3b-instruct":
+                model_type = "vision2seq"
+            logger.info(f"🔍 모델 프로필에서 로컬 경로 사용: {current_model_path}")
+            logger.info(f"🔍 결정된 모델 타입: {model_type}")
+        elif model_id:
+            logger.info(f"🔍 모델 ID 기반 경로 결정: {model_id}")
+
+            if not getattr(profile, 'is_local', False):
+                # Deployed environment: only a hub model name is available,
+                # so there is no local path to attach LoRA to.
+                logger.info(f"🔍 배포 환경: LoRA 설정 건너뜀 (HF 모델)")
+                return False
+
+            # Known local checkpoints: model id -> (path, model type).
+            known_local_models = {
+                "polyglot-ko-1.3b-chat": (
+                    "./lily_llm_core/models/polyglot_ko_1_3b_chat", "causal_lm"),
+                "kanana-1.5-v-3b-instruct": (
+                    "./lily_llm_core/models/kanana_1_5_v_3b_instruct", "vision2seq"),
+                "polyglot-ko-5.8b-chat": (
+                    "./lily_llm_core/models/polyglot_ko_5_8b_chat", "causal_lm"),
+            }
+            current_model_path, model_type = known_local_models.get(
+                model_id, (None, model_type))
+
+            logger.info(f"🔍 결정된 모델 경로: {current_model_path}")
+            logger.info(f"🔍 결정된 모델 타입: {model_type}")
+
+        if not current_model_path:
+            logger.warning("⚠️ 현재 모델의 경로를 찾을 수 없어 LoRA 자동 로드 건너뜀")
+            return False
+
+        logger.info(f"🔍 LoRA 모델 경로: {current_model_path}")
+        logger.info(f"🔍 LoRA 모델 타입: {model_type}")
+
+        # Reuse the already-loaded main model instead of loading it again.
+        logger.info("🔧 기존 메인 모델에 LoRA 직접 적용 시작...")
+
+        # NOTE(review): base_model is only filled when it is still None, so
+        # after a model switch the manager may keep the previous model —
+        # confirm against lora_manager's contract.
+        if hasattr(lora_manager, 'base_model') and lora_manager.base_model is None:
+            # Read the module-level global directly; importing this module by
+            # its package name from inside itself is unnecessary and can pick
+            # up a different module object when the file is run another way.
+            if current_model is not None:
+                lora_manager.base_model = current_model
+                logger.info("✅ 기존 메인 모델을 LoRA 관리자에 설정 완료")
+            else:
+                logger.warning("⚠️ 메인 모델을 찾을 수 없어 LoRA 설정 건너뜀")
+                return False
+
+        logger.info("🔧 LoRA 설정 생성 시작...")
+
+        # Per-model target modules.
+        if model_type == "vision2seq" and "kanana" in (model_id or ""):
+            # Kanana: Llama-style language model, first decoder layer only.
+            layer0 = "language_model.model.layers.0"
+            target_modules = [
+                f"{layer0}.self_attn.q_proj",
+                f"{layer0}.self_attn.k_proj",
+                f"{layer0}.self_attn.v_proj",
+                f"{layer0}.self_attn.o_proj",
+                f"{layer0}.mlp.gate_proj",
+                f"{layer0}.mlp.up_proj",
+                f"{layer0}.mlp.down_proj",
+            ]
+        else:
+            # Remaining models: GPTNeoX-style module names.
+            target_modules = ["query_key_value", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h"]
+
+        # NOTE(review): confirm create_lora_config accepts "VISION_2_SEQ" as a
+        # task type. The returned config is not used here; presumably the
+        # manager keeps it internally — verify.
+        lora_manager.create_lora_config(
+            r=16,
+            lora_alpha=32,
+            lora_dropout=0.1,
+            bias="none",
+            task_type="CAUSAL_LM" if model_type == "causal_lm" else "VISION_2_SEQ",
+            target_modules=target_modules,
+        )
+        logger.info("✅ LoRA 설정 생성 완료")
+
+        # Apply the adapter directly onto the existing main model.
+        logger.info("🔧 LoRA 어댑터 적용 시작...")
+        if lora_manager.apply_lora_to_model("auto_adapter"):
+            logger.info("✅ LoRA 어댑터 적용 완료: auto_adapter")
+            logger.info("🎉 LoRA 자동 설정 완료!")
+            return True
+
+        logger.error("❌ LoRA 어댑터 적용 실패")
+        return False
+
+    except Exception as e:
+        logger.error(f"❌ LoRA 자동 설정 중 오류: {e}")
+        return False
+
+# ===== lifespan 컨텍스트 매니저 (서버 시작/종료 이벤트) =====
+from contextlib import asynccontextmanager
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Run server startup and shutdown work around the application's lifetime.
+
+    Startup: tunes CPU threads, picks a model, loads it and configures the
+    context manager. A failed model load is logged and leaves ``model_loaded``
+    False; the server still starts.
+
+    Shutdown: stops the shared thread pool. This is done here because, per the
+    FastAPI lifespan documentation, ``on_event`` handlers are not called once
+    a ``lifespan`` is passed to the application.
+    """
+    global model_loaded
+
+    logger.info("🚀 서버 시작 이벤트 실행 중...")
+
+    # CPU thread tuning is best-effort.
+    try:
+        configure_cpu_threads()
+        logger.info("✅ CPU 스레드 최적화 완료")
+    except Exception as e:
+        logger.error(f"❌ CPU 스레드 설정 실패: {e}")
+
+    # Choose which model to serve.
+    selected_model_id = select_model_interactive()
+    logger.info(f"🚀 서버 시작 시 선택된 모델: {selected_model_id}")
+
+    try:
+        await load_model_async(selected_model_id)
+        model_loaded = True
+        logger.info(f"✅ 서버가 '{current_profile.display_name}' 모델로 준비되었습니다.")
+        logger.info(f"✅ model_loaded 상태: {model_loaded}")
+
+        # Context-manager tuning is optional; a failure only logs a warning.
+        try:
+            # "smart" is the summary method chosen by the author as the most balanced.
+            context_manager.set_summary_method("smart")
+            logger.info("✅ 고급 컨텍스트 관리자 설정 완료: smart 요약 방법 활성화")
+
+            context_manager.set_auto_cleanup_config(
+                enabled=True,
+                interval_turns=5,      # clean up every 5 turns
+                interval_time=180,     # clean up every 3 minutes
+                strategy="aggressive"   # aggressive cleanup to save memory
+            )
+            logger.info("✅ 자동 정리 설정 최적화 완료")
+
+        except Exception as e:
+            logger.warning(f"⚠️ 고급 컨텍스트 관리자 설정 실패: {e}")
+
+        # LoRA auto-setup already happens inside the model load, so it is not
+        # repeated here.
+
+    except Exception as e:
+        logger.error(f"❌ 모델 로드에 실패했습니다: {e}", exc_info=True)
+        model_loaded = False
+
+    logger.info("✅ 서버 시작 이벤트 완료")
+
+    try:
+        yield  # server is running
+    finally:
+        logger.info("🛑 서버 종료 이벤트 실행 중...")
+        # Stop the worker pool used for blocking model loads.
+        executor.shutdown(wait=True)
+        logger.info("✅ 서버 종료 이벤트 완료")
+
+# Create the FastAPI app; startup/shutdown work is supplied through the
+# lifespan context manager.
 app = FastAPI(
-    title="Lily LLM API",
-    description="Hearth Chat용 파인튜닝된 Mistral-7B 모델 API",
-    version="1.0.0"
+    title="Lily LLM API v2",
+    description="다중 모델 지원 LLM API 서버",
+    version="2.0.0",
+    lifespan=lifespan
 )
 
 # CORS 설정
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # 개발용, 프로덕션에서는 특정 도메인만 허용
+    # NOTE(review): the list still ends with "*", so the explicit localhost
+    # entries look redundant; together with allow_credentials=True this should
+    # be tightened before production — confirm.
+    allow_origins=[
+        "http://localhost:8001",
+        "http://127.0.0.1:8001",
+        "http://localhost:3000",
+        "http://127.0.0.1:3000",
+        "*"  # allow every origin during development
+    ],
     allow_credentials=True,
-    allow_methods=["*"],
+    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
     allow_headers=["*"],
 )
 
 # Pydantic 모델들
 class GenerateRequest(BaseModel):
+    """Request schema for text generation; every field except prompt defaults to None."""
     prompt: str
-    max_length: Optional[int] = 100
-    temperature: Optional[float] = 0.7
-    top_p: Optional[float] = 0.9
-    do_sample: Optional[bool] = True
+    model_id: Optional[str] = None  # default removed - the currently loaded model is used
+    max_length: Optional[int] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    do_sample: Optional[bool] = None
 
 class GenerateResponse(BaseModel):
+    """Response schema for generation: text, processing time, model name and image flag (all required)."""
     generated_text: str
     processing_time: float
-    model_name: str = "Lily LLM (Mistral-7B)"
+    model_name: str
+    image_processed: bool
 
+class MultimodalGenerateResponse(BaseModel):
+    """Generation response schema with an optional model_id; image_processed defaults to False."""
+    generated_text: str
+    processing_time: float
+    model_name: str
+    model_id: Optional[str] = None
+    image_processed: bool = False
+    
 class HealthResponse(BaseModel):
+    """Health-check schema: status, load flag, current model and the list of available models."""
     status: str
     model_loaded: bool
-    model_name: str
+    current_model: str
+    available_models: List[dict]
+
+class DocumentUploadResponse(BaseModel):
+    """Schema describing the outcome of a document upload."""
+    success: bool
+    document_id: str
+    message: str
+    chunks: Optional[int] = None
+    latex_count: Optional[int] = None  # number of LaTeX formulas
+    error: Optional[str] = None
+    auto_response: Optional[str] = None  # automatic response text
+
+class RAGResponse(BaseModel):
+    """Schema for a RAG answer: response text, context, sources, result count and processing time."""
+    success: bool
+    response: str
+    context: str
+    sources: List[dict]
+    search_results: int
+    processing_time: float
+
+# 사용자 관련 응답 모델
+class UserResponse(BaseModel):
+    """Schema for user-related responses; only success and user_id are required."""
+    success: bool
+    user_id: str
+    username: Optional[str] = None
+    email: Optional[str] = None
+    created_at: Optional[str] = None
+    error: Optional[str] = None
+
+class SessionResponse(BaseModel):
+    """Schema for session-related responses; only success and session_id are required."""
+    success: bool
+    session_id: str
+    session_name: Optional[str] = None
+    created_at: Optional[str] = None
+    error: Optional[str] = None
+
+class ChatMessageResponse(BaseModel):
+    """Schema for a single chat message in a response, with an optional error field."""
+    success: bool
+    message_id: int
+    content: str
+    message_type: str
+    timestamp: str
+    error: Optional[str] = None
+
+# 인증 관련 응답 모델
+class LoginResponse(BaseModel):
+    """Schema for a login result; token and user fields are optional and default to None."""
+    success: bool
+    access_token: Optional[str] = None
+    refresh_token: Optional[str] = None
+    token_type: Optional[str] = None
+    user_id: Optional[str] = None
+    username: Optional[str] = None
+    error: Optional[str] = None
+
+class TokenResponse(BaseModel):
+    """Schema for a token result; access_token, token_type and error are optional."""
+    success: bool
+    access_token: Optional[str] = None
+    token_type: Optional[str] = None
+    error: Optional[str] = None
 
 # 전역 변수
 model = None
 tokenizer = None
+processor = None  # assigned by load_model_sync together with model and tokenizer
+# NOTE(review): current_profile (and model_loaded just below) were already
+# initialised near the top of this module; this second assignment looks redundant.
+current_profile = None
 model_loaded = False
+image_processor = None
+# Shared thread pool; load_model_async runs the blocking model load on it.
+executor = concurrent.futures.ThreadPoolExecutor()
 
-@app.on_event("startup")
-async def startup_event():
-    """서버 시작 시 모델 로드"""
-    global model, tokenizer, model_loaded
-    
-    logger.info("🚀 Lily LLM API 서버 시작 중...")
-    logger.info("📝 API 문서: http://localhost:8001/docs")
-    logger.info("🔍 헬스 체크: http://localhost:8001/health")
-    
+def configure_cpu_threads():
+    """Set thread counts for OpenMP/MKL/numexpr and PyTorch.
+
+    The count comes from the ``CPU_THREADS`` environment variable when set,
+    otherwise from ``os.cpu_count()`` capped at 16; it is never below 1.
+    Failures are logged and never raised.
+    """
+    print(f"🔍 [DEBUG] configure_cpu_threads 시작")
     try:
-        # 모델 로딩 (비동기로 처리하여 서버 시작 속도 향상)
-        await load_model_async()
-        model_loaded = True
-        logger.info("✅ 모델 로딩 완료!")
+        env_threads = os.getenv("CPU_THREADS")
+        if env_threads is not None:
+            threads = max(1, int(env_threads))
+        else:
+            # Use the detected vCPU count, capped at 16 to avoid excessive threads.
+            threads = max(1, min(os.cpu_count() or 2, 16))
+
+        # OpenMP/MKL are always overridden; numexpr and tokenizers only when unset.
+        os.environ["OMP_NUM_THREADS"] = str(threads)
+        os.environ["MKL_NUM_THREADS"] = str(threads)
+        os.environ.setdefault("NUMEXPR_NUM_THREADS", str(threads))
+        os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+
+        # PyTorch thread settings are best-effort, but a failure is now
+        # recorded at debug level instead of being silently discarded.
+        try:
+            torch.set_num_threads(threads)
+        except Exception as e:
+            logger.debug(f"torch.set_num_threads({threads}) skipped: {e}")
+
+        # The author recommends 1-2 inter-op threads to limit context switching.
+        interop_threads = 1 if threads <= 4 else 2
+        try:
+            torch.set_num_interop_threads(interop_threads)
+        except Exception as e:
+            logger.debug(f"torch.set_num_interop_threads({interop_threads}) skipped: {e}")
+
+        logger.info(f"🧵 CPU thread config -> OMP/MKL/numexpr={threads}, torch_threads={threads}")
     except Exception as e:
-        logger.error(f"❌ 모델 로딩 실패: {e}")
-        model_loaded = False
+        logger.warning(f"⚠️ CPU 스레드 설정 실패: {e}")
+    print(f"🔍 [DEBUG] configure_cpu_threads 종료")
 
-async def load_model_async():
-    """비동기 모델 로딩"""
-    global model, tokenizer
+def select_model_interactive():
+    """Print the available models and return the id of the default one.
+
+    The interactive prompt is disabled: the second listed model is chosen,
+    falling back to the first when only one model is registered. The previous
+    retry loop never ended in that situation, because the hard-coded index
+    kept raising IndexError.
+
+    Returns:
+        The ``model_id`` of the selected model.
+
+    Raises:
+        RuntimeError: When no model is available at all.
+    """
+    available_models = list_available_models()
     
-    # 모델 로딩은 별도 스레드에서 실행
-    import asyncio
-    import concurrent.futures
+    if not available_models:
+        raise RuntimeError("No models are available; cannot select a model to load")
+
+    print("\n" + "="*60 + "\n🤖 Lily LLM API v2 - 모델 선택\n" + "="*60)
+    for i, model_info in enumerate(available_models, 1):
+        print(f"{i:2d}. {model_info['name']} ({model_info['model_id']})")
+
+    # Default to the second entry (index 1), as before, when it exists.
+    default_index = 1 if len(available_models) > 1 else 0
+    selected_model = available_models[default_index]
+    print(f"\n✅ '{selected_model['name']}' 모델을 선택했습니다.")
+    return selected_model['model_id']
+
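+# The former @app.on_event("startup") handler is no longer used (the author notes it did not work on recent FastAPI versions);
+# its startup logic now lives in the lifespan context manager.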
+
+# NOTE(review): the app is created with lifespan=...; FastAPI documents that
+# on_event handlers are not called in that case, so this may never run — confirm.
+@app.on_event("shutdown")
+def shutdown_event():
+    """Shut down the shared thread pool, waiting for running tasks to finish."""
+    executor.shutdown(wait=True)
+
+async def load_model_async(model_id: str):
+    """Load a model without blocking the event loop.
+
+    Runs the blocking ``load_model_sync(model_id)`` on the shared thread pool
+    and awaits its completion; any exception it raises propagates to the caller.
+    """
+    # Inside a coroutine the running loop is the one to use.
+    loop = asyncio.get_running_loop()
+    await loop.run_in_executor(executor, load_model_sync, model_id)
+
+@app.post("/load-model")
+async def load_model_endpoint(model_id: str):
+    """HTTP endpoint that loads the model identified by ``model_id``.
+
+    Returns a dict with ``success`` and either ``message`` or ``error``;
+    failures are reported in the body rather than raised. The module-level
+    ``model_loaded`` flag is kept in step with the outcome so it does not go
+    stale after a runtime model switch.
+    """
+    global model_loaded
+    try:
+        logger.info(f"📥 HTTP 요청으로 모델 로드 시작: {model_id}")
+        await load_model_async(model_id)
+    except Exception as e:
+        # The loader unloads the previous model before loading, so after a
+        # failure the flag must reflect whether a model object still exists.
+        model_loaded = model is not None
+        logger.error(f"❌ HTTP 모델 로드 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+    model_loaded = True
+    return {"success": True, "message": f"모델 '{model_id}' 로드 완료"}
+
+def load_model_sync(model_id: str):
+    """Synchronously load a model and its processor into module globals.
+
+    Unloads any previously loaded model, resolves the profile for
+    ``model_id``, loads ``(model, processor)`` from it, derives the tokenizer
+    and then runs the automatic LoRA setup.
+
+    Raises:
+        Exception: Any loading error is logged with its traceback and re-raised.
+    """
+    global model, tokenizer, processor, current_profile, current_model
     
-    def load_model_sync():
-        from transformers import AutoTokenizer, AutoModelForCausalLM
-        from peft import PeftModel
-        import torch
+    try:
+        if model is not None:
+            logger.info("🗑️ 기존 모델 언로드 중...")
+            # Drop every module-level reference, including current_model;
+            # while it still points at the old model, gc cannot free it.
+            # NOTE(review): lora_manager.base_model may also keep the old
+            # model alive — confirm against lora_manager.
+            model, tokenizer, processor, current_model = None, None, None, None
+            import gc
+            gc.collect()
+            logger.info("✅ 기존 모델 언로드 완료")
         
-        logger.info("모델 로딩 중...")
+        logger.info(f"📥 '{model_id}' 모델 로딩 시작...")
+        current_profile = get_model_profile(model_id)
         
-        # 로컬 모델 경로 사용
-        local_model_path = "./lily_llm_core/models/polyglot-ko-1.3b"
+        # The profile's load_model returns a (model, processor) pair.
+        model, processor = current_profile.load_model()
         
-        try:
-            # 로컬 모델과 토크나이저 로드
-            tokenizer = AutoTokenizer.from_pretrained(local_model_path, use_fast=True)
-            
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
-            
-            # 모델 로드 (CPU에서)
-            model = AutoModelForCausalLM.from_pretrained(
-                local_model_path,
-                torch_dtype=torch.float32,
-                device_map="cpu",
-                low_cpu_mem_usage=True
+        # Expose the model through the global read by the LoRA setup.
+        current_model = model
+
+        # Use the processor's tokenizer when it has one; otherwise the
+        # processor itself is used as the tokenizer.
+        tokenizer = getattr(processor, 'tokenizer', processor)
+
+        logger.info(f"✅ '{current_profile.display_name}' 모델 로딩 완료!")
+
+        # Automatic LoRA setup through the shared helper.
+        setup_lora_for_model(current_profile, lora_manager)
+
+    except Exception as e:
+        logger.error(f"❌ load_model_sync 실패: {e}")
+        import traceback
+        logger.error(f"🔍 전체 에러: {traceback.format_exc()}")
+        raise
+
+def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None, 
+                 temperature: Optional[float] = None, top_p: Optional[float] = None, 
+                 do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
+                 user_id: str = "anonymous", room_id: str = "default") -> dict:
+    """[최적화] 모델 생성을 처리하는 통합 동기 함수"""
+    try:
+        print(f"🔍 [DEBUG] generate_sync 시작 - prompt 길이: {len(prompt)}")
+        print(f"🔍 [DEBUG] 현재 로드된 모델: {current_profile.display_name if current_profile else 'None'}")
+        print(f"🔍 [DEBUG] 모델 타입: {type(current_profile) if current_profile else 'None'}")
+        
+        if current_profile is None:
+            print("❌ [DEBUG] 모델이 로드되지 않음")
+            return {"error": "No model loaded"}
+        
+        print(f"🔍 [DEBUG] 모델 이름: {getattr(current_profile, 'model_name', 'Unknown')}")
+        print(f"🔍 [DEBUG] 멀티모달 지원: {getattr(current_profile, 'multimodal', False)}")
+        print(f"🔍 [DEBUG] 입력 프롬프트: {prompt}")
+        print(f"🔍 [DEBUG] 입력 프롬프트 길이: {len(prompt)}")
+        print(f"🔍 [DEBUG] 이미지 데이터 존재 여부: {image_data_list is not None}")
+        print(f"🔍 [DEBUG] 이미지 데이터 개수: {len(image_data_list) if image_data_list else 0}")
+        print(f"🔍 [DEBUG] 실제 이미지 데이터 개수: {len([img for img in image_data_list if img]) if image_data_list else 0}")
+        
+        image_processed = False
+        all_pixel_values = []
+        combined_image_metas = None
+
+        # --- 1. 이미지 처리 (공식 방식) ---
+        # 🔄 RAG에서 추출된 이미지 데이터도 포함
+        all_image_data = []
+        if image_data_list and len([img for img in image_data_list if img]) > 0:
+            all_image_data.extend(image_data_list)
+            print(f"🔍 [DEBUG] 직접 전달된 이미지 {len(image_data_list)}개 추가")
+        
+        # 🔄 RAG에서 추출된 이미지 데이터는 현재 구현에서 제거됨 (전역 변수 문제 해결)
+        
+        if all_image_data and len([img for img in all_image_data if img]) > 0 and getattr(current_profile, 'multimodal', False):
+            print(f"🔍 [DEBUG] 이미지 처리 시작 - 총 이미지 개수: {len([img for img in all_image_data if img])}")
+            
+            # 🔄 공식 방식: 간단한 이미지 처리
+            max_images = min(len(all_image_data), 4)
+            logger.info(f"🖼️  멀티모달 처리 시작... (이미지 {max_images}개)")
+            
+            try:
+                metas_list = []
+                for idx, image_bytes in enumerate(all_image_data[:max_images]):
+                    if image_bytes:
+                        try:
+                            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+                            # 🔄 공식 이미지 프로세서 사용
+                            if processor and hasattr(processor, 'image_processor'):
+                                processed = processor.image_processor(pil_image)
+                                all_pixel_values.append(processed["pixel_values"])
+                                metas_list.append(processed.get("image_meta", {}))
+                            else:
+                                logger.warning(f"⚠️ 이미지 프로세서를 찾을 수 없음")
+                        except Exception as e:
+                            logger.warning(f"⚠️ 이미지 {idx} 처리 실패: {e}")
+                
+                # 🔄 메타데이터 통합 (공식 방식)
+                if metas_list:
+                    combined_image_metas = {}
+                    for key in metas_list[0].keys():
+                        combined_image_metas[key] = [meta[key] for meta in metas_list if key in meta]
+                    print(f"🔍 [DEBUG] 이미지 메타데이터: {combined_image_metas}")
+                else:
+                    combined_image_metas = {}
+            except Exception as e:
+                logger.error(f"❌ 이미지 전처리 실패: {e}")
+                combined_image_metas = {}
+
+        # --- 2. 프롬프트 구성 ---
+        print(f"🔍 [DEBUG] 프롬프트 구성 시작")
+        
+        # 컨텍스트 통합 (대화 기록 + RAG 검색 결과 포함) - 모델별 최적화
+        context_prompt = ""
+        if use_context and session_id:
+            try:
+                # 1. 대화 기록 컨텍스트
+                context = context_manager.get_context_for_model(
+                    current_profile.model_name, 
+                    session_id
+                )
+                if context and len(context.strip()) > 0:
+                    context_prompt = context + "\n\n"
+                    print(f"🔍 [DEBUG] 대화 컨텍스트 포함됨 - 길이: {len(context_prompt)} (세션: {session_id})")
+                
+                # 2. RAG 검색 결과 컨텍스트 (PDF 내용 포함)
+                try:
+                    # �� 새로운 메모리 시스템을 사용한 RAG 컨텍스트 로드
+                    rag_context = ""
+                    
+                    # 🔒 사용자 설정 확인
+                    from lily_llm_core.user_memory_manager import user_memory_manager
+                    keep_memory = user_memory_manager.get_memory_setting(user_id, "keep_memory_on_room_change")
+                    
+                    if keep_memory:
+                        # 메모리 유지 모드 - 기존 로직 실행
+                        print(f"🔄 [DEBUG] 사용자 {user_id} 메모리 유지 모드 - RAG 컨텍스트 로드")
+                        
+                        # 통합 메모리 관리자에서 AI용 컨텍스트 생성
+                        ai_context = integrated_memory_manager.get_context_for_ai(
+                            user_id=user_id,
+                            room_id=room_id,
+                            session_id=session_id,
+                            include_user_memory=True,
+                            include_room_context=True,
+                            include_session_history=False  # 현재 대화는 별도로 처리
+                        )
+                        
+                        if ai_context:
+                            rag_context += f"\n\n🔗 메모리 컨텍스트:\n{ai_context}\n"
+                            print(f"🔍 [DEBUG] 메모리 컨텍스트 포함됨 - 길이: {len(ai_context)}")
+                        
+                        # 기존 RAG 시스템에서 문서 내용 가져오기 (room_id 기반)
+                        try:
+                            # 채팅방별 문서 컨텍스트 조회
+                            room_context = integrated_memory_manager.room_context_manager.get_room_context(room_id)
+                            if room_context and room_context.documents:
+                                rag_context += "\n\n📄 업로드된 문서 목록:\n"
+                                for doc in room_context.documents[-3:]:  # 최근 3개만
+                                    # 딕셔너리와 객체 모두 처리
+                                    if isinstance(doc, dict):
+                                        filename = doc.get('filename', 'unknown')
+                                        doc_type = doc.get('document_type', 'unknown')
+                                        page_count = doc.get('page_count', 0)
+                                    else:
+                                        filename = getattr(doc, 'filename', 'unknown')
+                                        doc_type = getattr(doc, 'document_type', 'unknown')
+                                        page_count = getattr(doc, 'page_count', 0)
+                                    
+                                    rag_context += f"  - {filename} ({doc_type}, {page_count}페이지)\n"
+                                
+                                print(f"🔍 [DEBUG] 채팅방 {room_id}의 문서 {len(room_context.documents)}개 발견")
+                        
+                        except Exception as e:
+                            print(f"⚠️ 채팅방 문서 컨텍스트 로드 실패: {e}")
+                        
+                        # 🔒 문서 내용 자체는 로드하지 않음 (턴별 초기화)
+                        # 이전 턴에서 첨부된 문서의 실제 내용은 AI 컨텍스트에 포함하지 않음
+                        print(f"�� [DEBUG] 문서 내용 로드 건너뛰기 - 턴별 초기화 적용")
+                        
+                        # �� 현재 턴에서만 문서 정보 표시 (실제 내용은 로드하지 않음)
+                        if rag_context:
+                            context_prompt += rag_context
+                            print(f"🔍 [DEBUG] 문서 목록만 표시 - 실제 내용 로드 안함 (턴별 초기화)")
+                    
+                except Exception as e:
+                    print(f"⚠️ [DEBUG] RAG 컨텍스트 처리 실패: {e}")
+                
+                if not context_prompt:
+                    print(f"�� [DEBUG] 컨텍스트 없음 또는 비어있음 (세션: {session_id})")
+                    
+            except Exception as e:
+                print(f"⚠️ [DEBUG] 컨텍스트 로드 실패: {e} (세션: {session_id})")
+                context_prompt = ""
+        
+        # formatted_prompt 초기화
+        formatted_prompt = None
+        
+        # 🔄 멀티모달 프롬프트 구성 (공식 방식)
+        if all_pixel_values and len(all_pixel_values) > 0:
+            # 🔄 공식 Kanana 형식: Human: <image> 텍스트
+            # 이미지 토큰은 encode_prompt에서 자동으로 처리됨
+            formatted_prompt = f"Human: <image>{prompt}"
+            print(f"🔍 [DEBUG] 멀티모달 프롬프트 구성 (공식 형식): {formatted_prompt}")
+            image_processed = True
+        else:
+            image_tokens = ""
+            image_processed = False
+            print(f"🔍 [DEBUG] 이미지 없음 - 텍스트-only 모드")
+            
+            # 텍스트-only 모델용 프롬프트 구성 (컨텍스트 포함)
+            if hasattr(current_profile, 'format_prompt'):
+                # Polyglot 모델일 때는 format_prompt 메서드 사용 (컨텍스트 지원)
+                if "polyglot" in current_profile.model_name.lower():
+                    # 컨텍스트와 프롬프트를 함께 전달
+                    formatted_prompt = current_profile.format_prompt(prompt, context_prompt)
+                else:
+                    # 다른 모델은 기존 방식 사용
+                    base_prompt = current_profile.format_prompt(prompt)
+                    if context_prompt:
+                        formatted_prompt = context_prompt + base_prompt
+                    else:
+                        formatted_prompt = base_prompt
+                print(f"🔍 [DEBUG] 프로필 format_prompt 사용 (컨텍스트 포함): {formatted_prompt}")
+            else:
+                # 기본 프롬프트 (fallback) - 컨텍스트 포함
+                # Polyglot 모델은 <|im_start|> 태그를 제대로 처리하지 못함
+                if "polyglot" in current_profile.model_name.lower():
+                    base_prompt = f"### 사용자:\n{prompt}\n\n### 챗봇:\n"
+                else:
+                    base_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+                
+                if context_prompt:
+                    formatted_prompt = context_prompt + base_prompt
+                else:
+                    formatted_prompt = base_prompt
+                print(f"🔍 [DEBUG] 기본 프롬프트 사용 (컨텍스트 포함): {formatted_prompt}")
+        
+        print(f"🔍 [DEBUG] 프롬프트 구성 완료 - 길이: {len(formatted_prompt) if formatted_prompt else 0}")
+        print(f"🔍 [DEBUG] 최종 프롬프트: {formatted_prompt}")
+        
+        # --- 3. 토크나이징 ---
+        print(f"🔍 [DEBUG] 토크나이징 시작")
+        t_tok_start = time.time()
+        if not all_image_data or len([img for img in all_image_data if img]) == 0:
+            # 텍스트-only 고정 경로 (더 빠름)
+            print(f"🔍 [DEBUG] 텍스트-only 토크나이징 경로")
+            print(f"🔍 [DEBUG] 사용할 프롬프트: {formatted_prompt}")
+            
+            inputs = tokenizer(
+                formatted_prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048,
             )
+            if 'token_type_ids' in inputs:
+                del inputs['token_type_ids']
+                print(f"🔍 [DEBUG] token_type_ids 제거됨")
+            
+            input_ids = inputs['input_ids']
+            attention_mask = inputs['attention_mask']
+            print(f"🔍 [DEBUG] 토크나이저 출력: {list(inputs.keys())}")
+        else:
+            # 멀티모달(Lite): Kanana 전용 encode_prompt로 -1 토큰 자리 생성 (필수)
+            print(f"🔍 [DEBUG] 멀티모달 토크나이징 경로")
+            print(f"🔍 [DEBUG] combined_image_metas: {combined_image_metas}")
+            print(f"🔍 [DEBUG] 총 이미지 개수: {len(all_image_data)}")
+            
+            if hasattr(tokenizer, 'encode_prompt'):
+                print(f"🔍 [DEBUG] encode_prompt 메서드 사용")
+                
+                # 🔄 메타데이터 검증 및 안전화
+                safe_image_meta = {}
+                if combined_image_metas:
+                    # image_token_thw 배열 길이 검증
+                    if 'image_token_thw' in combined_image_metas:
+                        image_token_thw = combined_image_metas['image_token_thw']
+                        if isinstance(image_token_thw, list) and len(image_token_thw) > 0:
+                            # 배열 길이가 이미지 개수와 일치하는지 확인
+                            if len(image_token_thw) == len(all_pixel_values):
+                                # 🔄 추가 검증: 각 배열 요소가 유효한지 확인
+                                valid_meta = True
+                                for i, thw in enumerate(image_token_thw):
+                                    if not isinstance(thw, (list, tuple)) or len(thw) != 3:
+                                        print(f"⚠️ [DEBUG] 메타데이터 요소 {i}가 유효하지 않음: {thw}")
+                                        valid_meta = False
+                                        break
+                                
+                                if valid_meta:
+                                    safe_image_meta = combined_image_metas
+                                    print(f"🔍 [DEBUG] 메타데이터 검증 통과: {len(image_token_thw)}개 이미지")
+                                else:
+                                    print(f"⚠️ [DEBUG] 메타데이터 요소 검증 실패, 기본값 사용")
+                                    safe_image_meta = {
+                                        'image_token_thw': [[1, 1, 1]] * len(all_pixel_values),
+                                        'vision_grid_thw': [[1, 1, 1]] * len(all_pixel_values)
+                                    }
+                            else:
+                                print(f"⚠️ [DEBUG] 메타데이터 불일치: 이미지 {len(all_pixel_values)}개, 메타 {len(image_token_thw)}개")
+                                # 안전한 기본값 사용
+                                safe_image_meta = {
+                                    'image_token_thw': [[1, 1, 1]] * len(all_pixel_values),
+                                    'vision_grid_thw': [[1, 1, 1]] * len(all_pixel_values)
+                                }
+                        else:
+                            print(f"⚠️ [DEBUG] image_token_thw가 유효하지 않음, 기본값 사용")
+                            safe_image_meta = {
+                                'image_token_thw': [[1, 1, 1]] * len(all_pixel_values),
+                                'vision_grid_thw': [[1, 1, 1]] * len(all_pixel_values)
+                            }
+                    else:
+                        print(f"⚠️ [DEBUG] image_token_thw 없음, 기본값 생성")
+                        safe_image_meta = {
+                            'image_token_thw': [[1, 1, 1]] * len(all_pixel_values),
+                            'vision_grid_thw': [[1, 1, 1]] * len(all_pixel_values)
+                        }
+                else:
+                    print(f"⚠️ [DEBUG] combined_image_metas 없음, 기본값 생성")
+                    safe_image_meta = {
+                        'image_token_thw': [[1, 1, 1]] * len(all_pixel_values),
+                        'vision_grid_thw': [[1, 1, 1]] * len(all_pixel_values)
+                    }
+                
+                print(f"🔍 [DEBUG] 안전화된 메타데이터: {safe_image_meta}")
+                
+                # 🔄 안전한 메타데이터로 encode_prompt 호출
+                try:
+                    # 🔄 추가 안전장치: 메타데이터 복사본 생성
+                    final_meta = {}
+                    for key, value in safe_image_meta.items():
+                        if isinstance(value, list):
+                            final_meta[key] = value.copy()  # 복사본 생성
+                        else:
+                            final_meta[key] = value
+                    
+                    print(f"🔍 [DEBUG] 최종 메타데이터: {final_meta}")
+                    # 🔄 공식 방식: max_length 파라미터 추가
+                    inputs = tokenizer.encode_prompt(
+                        prompt=formatted_prompt, 
+                        max_length=2048,  # 공식 코드와 동일
+                        image_meta=final_meta
+                    )
+                    print(f"🔍 [DEBUG] encode_prompt 출력: {list(inputs.keys())}")
+                    
+                    # 🔄 encode_prompt 출력 정규화 (seq_length 제거)
+                    if 'seq_length' in inputs:
+                        print(f"🔍 [DEBUG] seq_length 제거됨")
+                        del inputs['seq_length']
+                    
+                    # 🔄 input_ids 안전하게 추출 (공식 방식)
+                    if isinstance(inputs['input_ids'], tuple):
+                        print(f"🔍 [DEBUG] input_ids가 튜플임: {len(inputs['input_ids'])}개 요소")
+                        input_ids = inputs['input_ids'][0]  # 첫 번째 요소 사용
+                        print(f"🔍 [DEBUG] input_ids 튜플에서 첫 번째 요소 추출: {input_ids.shape}")
+                    else:
+                        input_ids = inputs['input_ids']
+                    
+                    # 🔄 attention_mask도 안전하게 추출
+                    if isinstance(inputs['attention_mask'], tuple):
+                        print(f"🔍 [DEBUG] attention_mask가 튜플임: {len(inputs['attention_mask'])}개 요소")
+                        attention_mask = inputs['attention_mask'][0]  # 첫 번째 요소 사용
+                        print(f"🔍 [DEBUG] attention_mask 튜플에서 첫 번째 요소 추출: {attention_mask.shape}")
+                    else:
+                        attention_mask = inputs['attention_mask']
+                    
+                    # 🔄 최종 검증
+                    print(f"🔍 [DEBUG] 최종 input_ids 타입: {type(input_ids)}, shape: {input_ids.shape}")
+                    print(f"🔍 [DEBUG] 최종 attention_mask 타입: {type(attention_mask)}, shape: {attention_mask.shape}")
+                except Exception as e:
+                    print(f"❌ [DEBUG] encode_prompt 실패: {e}, 폴백 사용")
+                    # 폴백: 기본 토크나이저 사용
+                    inputs = tokenizer(
+                        formatted_prompt,
+                        return_tensors="pt",
+                        padding=True,
+                        truncation=True,
+                        max_length=2048,
+                    )
+                    if 'token_type_ids' in inputs:
+                        del inputs['token_type_ids']
+                    input_ids = inputs['input_ids']
+                    attention_mask = inputs['attention_mask']
+            else:
+                # Safe fallback: the tokenizer has no encode_prompt method, so use the plain tokenizer call
+                print(f"🔍 [DEBUG] 기본 토크나이저 사용 (폴백)")
+                inputs = tokenizer(
+                    formatted_prompt,
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=2048,  # was misspelled "max_lengt"; same 2048 cap as the other tokenizer calls
+                )
+                if 'token_type_ids' in inputs:
+                    del inputs['token_type_ids']
+                    print(f"🔍 [DEBUG] token_type_ids 제거됨 (폴백)")
+                input_ids = inputs['input_ids']
+                attention_mask = inputs['attention_mask']
+                print(f"🔍 [DEBUG] 기본 토크나이저 출력: {list(inputs.keys())}")
+        
+        t_tok_end = time.time()
+        print(f"🔍 [DEBUG] 토크나이징 완료 - 소요시간: {t_tok_end - t_tok_start:.3f}초")
+        # 🔄 input_ids 안전하게 처리
+        if isinstance(input_ids, tuple):
+            print(f"🔍 [DEBUG] input_ids가 튜플임: {len(input_ids)}개 요소")
+            input_ids = input_ids[0]  # 첫 번째 요소 사용
+            print(f"🔍 [DEBUG] input_ids 튜플에서 첫 번째 요소 추출: {input_ids.shape}")
+        
+        # 🔄 1차원 텐서를 2차원으로 reshape
+        if len(input_ids.shape) == 1:
+            print(f"🔍 [DEBUG] 1차원 텐서를 2차원으로 reshape: {input_ids.shape} -> (1, {input_ids.shape[0]})")
+            input_ids = input_ids.unsqueeze(0)  # (seq_len,) -> (1, seq_len)
+        
+        # 🔄 attention_mask도 동일하게 처리
+        if len(attention_mask.shape) == 1:
+            print(f"🔍 [DEBUG] attention_mask 1차원을 2차원으로 reshape: {attention_mask.shape} -> (1, {attention_mask.shape[0]})")
+            attention_mask = attention_mask.unsqueeze(0)  # (seq_len,) -> (1, seq_len)
+        
+        print(f"🔍 [DEBUG] 최종 input_ids shape: {input_ids.shape}")
+        print(f"🔍 [DEBUG] 입력 토큰 수: {input_ids.shape[1]}")
+        
+        # --- 4. 생성 설정 ---
+        print(f"🔍 [DEBUG] 생성 설정 구성 시작")
+        gen_config = current_profile.get_generation_config()
+        
+        # config 파일에 명시된 eos, pad, bos 토큰 id 기본값으로 채우기
+        if 'eos_token_id' not in gen_config or gen_config['eos_token_id'] is None:
+            gen_config['eos_token_id'] = tokenizer.eos_token_id
+
+        if 'pad_token_id' not in gen_config or gen_config['pad_token_id'] is None:
+            gen_config['pad_token_id'] = tokenizer.pad_token_id or tokenizer.eos_token_id
+
+        # 필요할 경우 bos_token_id 도 설정 (generate 함수에 따라 다름)
+        if 'bos_token_id' not in gen_config and hasattr(tokenizer, 'bos_token_id'):
+            gen_config['bos_token_id'] = tokenizer.bos_token_id
+
+        # max_new_tokens, temperature 등 API 인자 받아서 덮어쓰기
+        if max_length is not None:
+            gen_config['max_new_tokens'] = max_length
+
+        if temperature is not None:
+            gen_config['temperature'] = temperature
+
+        if top_p is not None:
+            gen_config['top_p'] = top_p
+
+        if do_sample is not None:
+            gen_config['do_sample'] = do_sample
+        
+        print(f"🔍 [DEBUG] 생성 설정: {gen_config}")
+        
+        # --- 5. 실제 추론 실행 ---
+        print(f"🔍 [DEBUG] 모델 추론 시작")
+        t_gen_start = time.time()
+        
+        try:
+            # 모델 상태 확인
+            print(f"🔍 [DEBUG] 모델 디바이스: {model.device}")
+            print(f"🔍 [DEBUG] 입력 텐서 디바이스: {input_ids.device}")
+            print(f"🔍 [DEBUG] 모델 타입: {type(model)}")
+            print(f"🔍 [DEBUG] 모델 상태: {'eval' if model.training == False else 'training'}")
+            print(f"🔍 [DEBUG] 입력 텐서 shape: {input_ids.shape}")
+            print(f"🔍 [DEBUG] attention_mask shape: {attention_mask.shape}")
+            print(f"🔍 [DEBUG] all_pixel_values 존재 여부: {all_pixel_values is not None}")
+            print(f"🔍 [DEBUG] all_pixel_values 길이: {len(all_pixel_values) if all_pixel_values else 0}")
             
-            logger.info("✅ polyglot-ko-1.3b 모델 로드 성공!")
-            return model, tokenizer
+            # 입력 텐서를 모델 디바이스로 이동
+            if input_ids.device != model.device:
+                print(f"🔍 [DEBUG] 입력 텐서를 모델 디바이스로 이동: {input_ids.device} -> {model.device}")
+                input_ids = input_ids.to(model.device)
+                attention_mask = attention_mask.to(model.device)
+            
+            # 🔄 torch import 문제 해결
+            import torch
+            with torch.no_grad():
+                if all_pixel_values and len(all_pixel_values) > 0:
+                    # 멀티모달: 이미지와 텍스트 함께 처리
+                    print(f"🔍 [DEBUG] 멀티모달 추론 실행")
+                    print(f"🔍 [DEBUG] 이미지 텐서 개수: {len(all_pixel_values)}")
+                    
+                    # 이미지 텐서도 디바이스 확인
+                    pixel_values = torch.cat(all_pixel_values, dim=0)
+                    print(f"🔍 [DEBUG] 결합된 이미지 텐서 shape: {pixel_values.shape}")
+                    print(f"🔍 [DEBUG] 이미지 텐서 dtype: {pixel_values.dtype}")
+                    
+                    # 🔄 모델과 동일한 dtype으로 변환 (성능 최적화)
+                    if hasattr(model, 'dtype'):
+                        target_dtype = model.dtype
+                        if pixel_values.dtype != target_dtype:
+                            print(f"🔍 [DEBUG] 이미지 텐서 dtype 변환: {pixel_values.dtype} -> {target_dtype}")
+                            pixel_values = pixel_values.to(dtype=target_dtype)
+                    else:
+                        # 🔄 모델 dtype을 알 수 없는 경우 bfloat16 사용 (Kanana 모델 기본값)
+                        target_dtype = torch.bfloat16
+                        if pixel_values.dtype != target_dtype:
+                            print(f"🔍 [DEBUG] 이미지 텐서 dtype 변환: {pixel_values.dtype} -> {target_dtype}")
+                            pixel_values = pixel_values.to(dtype=target_dtype)
+                    
+                    if pixel_values.device != model.device:
+                        print(f"🔍 [DEBUG] 이미지 텐서를 모델 디바이스로 이동: {pixel_values.device} -> {model.device}")
+                        pixel_values = pixel_values.to(model.device)
+                    
+                    print(f"🔍 [DEBUG] 최종 이미지 텐서 디바이스: {pixel_values.device}")
+                    print(f"🔍 [DEBUG] 최종 이미지 텐서 dtype: {pixel_values.dtype}")
+                    print(f"🔍 [DEBUG] 모델 생성 시작 - 멀티모달")
+                    
+                    # LoRA 어댑터가 적용된 모델인지 확인
+                    if LORA_AVAILABLE and lora_manager and hasattr(lora_manager, 'current_adapter_name') and lora_manager.current_adapter_name:
+                        print(f"🔍 [DEBUG] LoRA 어댑터 적용됨 (멀티모달): {lora_manager.current_adapter_name}")
+                        # LoRA가 적용된 모델 사용
+                        lora_model = lora_manager.get_model()
+                        if lora_model:
+                            print(f"🔍 [DEBUG] LoRA 모델로 멀티모달 생성 실행")
+                            # 🔄 image_metas 파라미터 추가 (공식 방식)
+                            # 🔄 메타데이터를 공식 구조로 변환 (모델 요구사항)
+                            import torch
+                            processed_image_metas = {}
+                            
+                            # 🔄 공식 방식: vision_grid_thw를 텐서로 변환
+                            if 'vision_grid_thw' in combined_image_metas:
+                                vision_grid = combined_image_metas['vision_grid_thw']
+                                if isinstance(vision_grid, list):
+                                    # 🔄 Kanana 모델 요구사항: (T, H, W) 형태의 3차원 텐서
+                                    if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
+                                        # [(1, 34, 52)] -> (1, 34, 52) 텐서로 변환
+                                        t, h, w = vision_grid[0]
+                                        # 🔄 3차원 텐서로 변환: (1, H, W) 형태
+                                        processed_image_metas['vision_grid_thw'] = torch.tensor([[t, h, w]], dtype=torch.long)
+                                        print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
+                                    else:
+                                        # 🔄 다른 형태의 경우 원본 유지
+                                        processed_image_metas['vision_grid_thw'] = torch.tensor(vision_grid, dtype=torch.long)
+                                        print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환 (기본): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
+                                else:
+                                    processed_image_metas['vision_grid_thw'] = vision_grid
+                            
+                            # 🔄 다른 메타데이터는 그대로 유지
+                            for key, value in combined_image_metas.items():
+                                if key != 'vision_grid_thw':
+                                    processed_image_metas[key] = value
+                            
+                            generate_kwargs = {
+                                'input_ids': input_ids,
+                                'attention_mask': attention_mask,
+                                'pixel_values': pixel_values,
+                                'image_metas': processed_image_metas,  # 🔄 처리된 이미지 메타데이터
+                                **gen_config
+                            }
+                            print(f"🔍 [DEBUG] LoRA 모델 생성 파라미터: {list(generate_kwargs.keys())}")
+                            print(f"🔍 [DEBUG] 처리된 image_metas: {list(processed_image_metas.keys())}")
+                            print(f"🔍 [DEBUG] 모델 생성 시작... (타임아웃 없음)")
+                            
+                            # 🔄 생성 전 최종 검증
+                            print(f"🔍 [DEBUG] 최종 파라미터 검증:")
+                            print(f"  - input_ids: {input_ids.shape}, dtype: {input_ids.dtype}")
+                            print(f"  - attention_mask: {attention_mask.shape}, dtype: {attention_mask.dtype}")
+                            print(f"  - pixel_values: {pixel_values.shape}, dtype: {pixel_values.dtype}")
+                            print(f"  - vision_grid_thw: {processed_image_metas.get('vision_grid_thw', 'None')}")
+                            
+                            generated_ids = lora_model.generate(**generate_kwargs)
+                        else:
+                            print(f"⚠️ [DEBUG] LoRA 모델을 가져올 수 없음, 기본 모델 사용")
+                            # 🔄 image_metas 파라미터 추가 (공식 방식)
+                            # 🔄 메타데이터를 공식 구조로 변환 (모델 요구사항)
+                            processed_image_metas = {}
+                            
+                            # 🔄 공식 방식: vision_grid_thw를 텐서로 변환
+                            if 'vision_grid_thw' in combined_image_metas:
+                                vision_grid = combined_image_metas['vision_grid_thw']
+                                if isinstance(vision_grid, list):
+                                    # 🔄 Kanana 모델 요구사항: (T, H, W) 형태의 3차원 텐서
+                                    if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
+                                        # [(1, 34, 52)] -> (1, 34, 52) 텐서로 변환
+                                        t, h, w = vision_grid[0]
+                                        # 🔄 3차원 텐서로 변환: (1, H, W) 형태
+                                        processed_image_metas['vision_grid_thw'] = torch.tensor([[t, h, w]], dtype=torch.long)
+                                        print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
+                                    else:
+                                        # 🔄 다른 형태의 경우 원본 유지
+                                        processed_image_metas['vision_grid_thw'] = torch.tensor(vision_grid, dtype=torch.long)
+                                        print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환 (기본): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
+                                else:
+                                    processed_image_metas['vision_grid_thw'] = vision_grid
+                            
+                            # 🔄 다른 메타데이터는 그대로 유지
+                            for key, value in combined_image_metas.items():
+                                if key != 'vision_grid_thw':
+                                    processed_image_metas[key] = value
+                            
+                            generate_kwargs = {
+                                'input_ids': input_ids,
+                                'attention_mask': attention_mask,
+                                'pixel_values': pixel_values,
+                                'image_metas': processed_image_metas,  # 🔄 처리된 이미지 메타데이터
+                                **gen_config
+                            }
+                            print(f"🔍 [DEBUG] 기본 모델 생성 파라미터: {list(generate_kwargs.keys())}")
+                            print(f"🔍 [DEBUG] 처리된 image_metas: {list(processed_image_metas.keys())}")
+                            generated_ids = model.generate(**generate_kwargs)
+                    else:
+                        print(f"🔍 [DEBUG] LoRA 어댑터 없음 (멀티모달), 기본 모델 사용")
+                        # 🔄 image_metas 파라미터 추가 (공식 방식)
+                        # 🔄 메타데이터를 공식 구조로 변환 (모델 요구사항)
+                        processed_image_metas = {}
+                        
+                        # 🔄 공식 방식: vision_grid_thw를 텐서로 변환
+                        if 'vision_grid_thw' in combined_image_metas:
+                            vision_grid = combined_image_metas['vision_grid_thw']
+                            if isinstance(vision_grid, list):
+                                # 🔄 Kanana 모델 요구사항: (T, H, W) 형태의 3차원 텐서
+                                if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
+                                    # [(1, 34, 52)] -> (1, 34, 52) 텐서로 변환
+                                    t, h, w = vision_grid[0]
+                                    # 🔄 3차원 텐서로 변환: (1, H, W) 형태
+                                    processed_image_metas['vision_grid_thw'] = torch.tensor([[t, h, w]], dtype=torch.long)
+                                    print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
+                                else:
+                                    # 🔄 다른 형태의 경우 원본 유지
+                                    processed_image_metas['vision_grid_thw'] = torch.tensor(vision_grid, dtype=torch.long)
+                                    print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환 (기본): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
+                            else:
+                                processed_image_metas['vision_grid_thw'] = vision_grid
+                        
+                        # 🔄 다른 메타데이터는 그대로 유지
+                        for key, value in combined_image_metas.items():
+                            if key != 'vision_grid_thw':
+                                processed_image_metas[key] = value
+                        
+                        generate_kwargs = {
+                            'input_ids': input_ids,
+                            'attention_mask': attention_mask,
+                            'pixel_values': pixel_values,
+                            'image_metas': processed_image_metas,  # 🔄 처리된 이미지 메타데이터
+                            **gen_config
+                        }
+                        print(f"🔍 [DEBUG] 기본 모델 생성 파라미터: {list(generate_kwargs.keys())}")
+                        print(f"🔍 [DEBUG] 처리된 image_metas: {list(processed_image_metas.keys())}")
+                        generated_ids = model.generate(**generate_kwargs)
+
+                                        # 토큰 설정을 명시적으로 전달하여 EOS 토큰 문제 해결
+                    # generate_kwargs = {
+                    #     'input_ids': input_ids.to(model.device),
+                    #     'attention_mask': attention_mask.to(model.device),
+                    #     'pixel_values': pixel_values.to(model.device),
+                    #     'max_new_tokens': gen_config['max_new_tokens'],
+                    #     'temperature': gen_config['temperature'],
+                    #     'top_p': gen_config['top_p'],
+                    #     'do_sample': gen_config['do_sample'],
+                    #     'repetition_penalty': gen_config.get('repetition_penalty', 1.0),
+                    #     'no_repeat_ngram_size': gen_config.get('no_repeat_ngram_size', 0),
+                    #     # 'num_beams': gen_config.get('num_beams', 1),
+                    #     'use_cache': gen_config.get('use_cache', True),
+                    #     'max_time': gen_config.get('max_time', None),
+                    #     'early_stopping': gen_config.get('early_stopping', False),
+                    #     'stopping_criteria': gen_config.get('stopping_criteria', None),
+                    # }
+                    # 
+                    # # 토큰 ID 설정 (중요!)
+                    # if gen_config.get('eos_token_id') is not None:
+                    #     generate_kwargs['eos_token_id'] = gen_config['eos_token_id']
+                    # if gen_config.get('pad_token_id') is not None:
+                    #     generate_kwargs['pad_token_id'] = gen_config['pad_token_id']
+                    # if gen_config.get('bos_token_id') is not None:
+                    #     generate_kwargs['bos_token_id'] = gen_config['bos_token_id']
+                    # 
+                    # print(f"🔍 [DEBUG] 최종 생성 설정: {generate_kwargs}")
+                    # 
+                    # generated_ids = model.generate(**generate_kwargs)
+                                                        
+                else:
+                    # 텍스트-only: 기존 방식
+                    print(f"🔍 [DEBUG] 텍스트-only 추론 실행")
+                    print(f"🔍 [DEBUG] 생성 설정: {gen_config}")
+                    
+                    # 타임아웃 설정을 위한 추가 설정 (더 적절한 값으로 조정)
+                    # if 'max_time' not in gen_config:
+                    #     gen_config['max_time'] = 60.0  # 60초 타임아웃으로 조정
+                    
+                    # 추가 타임아웃 설정
+                    # gen_config['max_time'] = 60.0  # 강제 60초 타임아웃
+                    # print(f"🔍 [DEBUG] 강제 타임아웃 설정: {gen_config['max_time']}초")
+                    
+                    # 추가 성능 최적화 설정
+                    gen_config['use_cache'] = True  # 캐시 사용으로 속도 향상
+                    
+                    # PAD 토큰 설정 - 모델 프로필 설정 우선
+                    if 'pad_token_id' not in gen_config:
+                        # 프로필에 설정이 없을 때만 기본값 사용
+                        if tokenizer.pad_token_id is not None:
+                            gen_config['pad_token_id'] = tokenizer.pad_token_id
+                            print(f"🔍 [DEBUG] PAD 토큰 설정: 토크나이저 기본값 사용 (ID: {tokenizer.pad_token_id})")
+                        else:
+                            gen_config['pad_token_id'] = None
+                            print(f"🔍 [DEBUG] PAD 토큰 설정: None (토크나이저에 PAD 토큰 없음)")
+                    
+                    # 토큰 설정 - 프로필에서 설정된 값 우선 사용
+                    if 'eos_token_id' not in gen_config or gen_config['eos_token_id'] is None:
+                        if tokenizer.eos_token_id is not None:
+                            gen_config['eos_token_id'] = tokenizer.eos_token_id
+                            print(f"🔍 [DEBUG] EOS 토큰 설정: {tokenizer.eos_token_id}")
+                        else:
+                            gen_config['eos_token_id'] = None
+                            print(f"🔍 [DEBUG] EOS 토큰 설정: None (자동 처리)")
+                    
+                    if 'pad_token_id' not in gen_config or gen_config['pad_token_id'] is None:
+                        if tokenizer.pad_token_id is not None:
+                            gen_config['pad_token_id'] = tokenizer.pad_token_id
+                        else:
+                            gen_config['pad_token_id'] = None
+                    
+                    if 'bos_token_id' not in gen_config or gen_config['bos_token_id'] is None:
+                        if hasattr(tokenizer, 'bos_token_id') and tokenizer.bos_token_id is not None:
+                            gen_config['bos_token_id'] = tokenizer.bos_token_id
+                        else:
+                            gen_config['bos_token_id'] = None
+                    
+                    print(f"🔍 [DEBUG] 최종 토큰 설정: EOS={gen_config['eos_token_id']}, PAD={gen_config['pad_token_id']}, BOS={gen_config.get('bos_token_id')}")
+                    
+                    # 생성 설정 최종 확인
+                    print(f"🔍 [DEBUG] 최종 생성 설정: {gen_config}")
+                    
+                    print(f"🔍 [DEBUG] 모델 생성 시작 - 텍스트만")
+                    print(f"🔍 [DEBUG] 최종 입력 텐서 디바이스: {input_ids.device}")
+                    print(f"🔍 [DEBUG] 최종 attention_mask 디바이스: {attention_mask.device}")
+                    
+                    # Log the generation start time to help monitor model generation progress
+                    print(f"🔍 [DEBUG] 모델 생성 시작 시간: {time.time()}")
+                    
+                    # LoRA 어댑터가 적용된 모델인지 확인
+                    if LORA_AVAILABLE and lora_manager and hasattr(lora_manager, 'current_adapter_name') and lora_manager.current_adapter_name:
+                        print(f"🔍 [DEBUG] LoRA 어댑터 적용됨: {lora_manager.current_adapter_name}")
+                        # LoRA가 적용된 모델 사용
+                        lora_model = lora_manager.get_model()
+                        if lora_model:
+                            print(f"🔍 [DEBUG] LoRA 모델로 생성 실행")
+                            # LoRA 모델용 입력 처리 (token_type_ids 제거)
+                            lora_inputs = {
+                                'input_ids': input_ids,
+                                'attention_mask': attention_mask
+                            }
+                            # token_type_ids가 있다면 제거
+                            # if 'token_type_ids' in locals() and token_type_ids is not None:
+                            #     print(f"🔍 [DEBUG] token_type_ids 제거됨 (LoRA 모델 호환성)")
+                            
+                            generated_ids = lora_model.generate(
+                                **lora_inputs,
+                                **gen_config
+                            )
+                        else:
+                            print(f"⚠️ [DEBUG] LoRA 모델을 가져올 수 없음, 기본 모델 사용")
+                            generated_ids = model.generate(
+                                input_ids=input_ids,
+                                attention_mask=attention_mask,
+                                **gen_config
+                            )
+                    else:
+                        print(f"🔍 [DEBUG] LoRA 어댑터 없음, 기본 모델 사용")
+                        # LoRA 상태 디버깅
+                        if LORA_AVAILABLE:
+                            if lora_manager:
+                                print(f"🔍 [DEBUG] LoRA 매니저 존재: {type(lora_manager)}")
+                                if hasattr(lora_manager, 'current_adapter_name'):
+                                    print(f"🔍 [DEBUG] 현재 어댑터: {lora_manager.current_adapter_name}")
+                                if hasattr(lora_manager, 'base_model'):
+                                    print(f"🔍 [DEBUG] 기본 모델 로드됨: {lora_manager.base_model is not None}")
+                            else:
+                                print(f"🔍 [DEBUG] LoRA 매니저가 None")
+                        else:
+                            print(f"🔍 [DEBUG] LoRA 지원 안됨")
+                        
+                        generated_ids = model.generate(
+                            input_ids=input_ids,
+                            attention_mask=attention_mask,
+                            **gen_config
+                        )
+                    
+                    # 토큰 설정을 명시적으로 전달하여 EOS 토큰 문제 해결
+                    # generate_kwargs = {
+                    #      'input_ids': input_ids.to(model.device),
+                    #      'attention_mask': attention_mask.to(model.device),
+                    #      'max_new_tokens': gen_config['max_new_tokens'],
+                    #      'temperature': gen_config['temperature'],
+                    #      'top_p': gen_config['top_p'],
+                    #      'do_sample': gen_config['do_sample'],
+                    #      'repetition_penalty': gen_config.get('repetition_penalty', 1.0),
+                    #      'no_repeat_ngram_size': gen_config.get('no_repeat_ngram_size', 0),
+                    #     #  'num_beams': gen_config.get('num_beams', 1),
+                    #      'use_cache': gen_config.get('use_cache', True),
+                    #      'max_time': gen_config.get('max_time', None),
+                    #      'early_stopping': gen_config.get('early_stopping', False),
+                    #      'stopping_criteria': gen_config.get('stopping_criteria', None),
+                    #  }
+                    #  
+                    # # 토큰 ID 설정 (중요!)
+                    # if gen_config.get('eos_token_id') is not None:
+                    #     generate_kwargs['eos_token_id'] = gen_config['eos_token_id']
+                    # if gen_config.get('pad_token_id') is not None:
+                    #     generate_kwargs['pad_token_id'] = gen_config['pad_token_id']
+                    # if gen_config.get('bos_token_id') is not None:
+                    #     generate_kwargs['bos_token_id'] = gen_config['bos_token_id']
+                    
+                    # print(f"🔍 [DEBUG] 최종 생성 설정: {generate_kwargs}")
+                    
+                    # generated_ids = model.generate(**generate_kwargs)
+                                        
+                    print(f"🔍 [DEBUG] 모델 생성 완료 시간: {time.time()}")
+            
+            t_gen_end = time.time()
+            print(f"🔍 [DEBUG] 모델 추론 완료 - 소요시간: {t_gen_end - t_gen_start:.3f}초")
+            print(f"🔍 [DEBUG] 생성된 토큰 수: {generated_ids.shape[1] - input_ids.shape[1]}")
+            print(f"🔍 [DEBUG] 최종 generated_ids shape: {generated_ids.shape}")
+            print(f"🔍 [DEBUG] 최종 generated_ids 디바이스: {generated_ids.device}")
+            print(f"🔍 [DEBUG] 최종 generated_ids dtype: {generated_ids.dtype}")
             
         except Exception as e:
-            logger.error(f"로컬 모델 로드 실패: {e}")
-            logger.info("테스트용 간단한 모델 로드 중...")
+            print(f"❌ [DEBUG] 모델 추론 중 에러 발생: {str(e)}")
+            print(f"❌ [DEBUG] 에러 타입: {type(e).__name__}")
+            print(f"❌ [DEBUG] 에러 상세: {str(e)}")
+            import traceback
+            traceback.print_exc()
+            return {"error": f"Generation failed: {str(e)}"}
+        
+        # --- 6. 응답 추출 ---
+        print(f"🔍 [DEBUG] 응답 추출 시작")
+        t_decode_start = time.time()
+        
+        try:
+            # 생성된 텍스트 디코딩
+            full_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+            print(f"🔍 [DEBUG] 전체 텍스트 길이: {len(full_text)}")
+            print(f"🔍 [DEBUG] 전체 생성 텍스트 (Raw): \n---\n{full_text}\n---")
+            print(f"🔍 [DEBUG] 사용된 프롬프트: {formatted_prompt}")
+            
+            # 프로필별 응답 추출 (안전한 방식)
+            if hasattr(current_profile, 'extract_response'):
+                try:
+                    response = current_profile.extract_response(full_text, formatted_prompt)
+                    print(f"🔍 [DEBUG] 프로필 extract_response 사용 성공")
+                except Exception as extract_error:
+                    print(f"⚠️ [DEBUG] 프로필 extract_response 실패: {extract_error}")
+                    # 폴백: 기본 응답 추출
+                    response = full_text.replace(formatted_prompt, "").strip() if formatted_prompt else full_text
+                    print(f"🔍 [DEBUG] 기본 응답 추출 사용 (폴백)")
+            else:
+                # 기본 응답 추출
+                response = full_text.replace(formatted_prompt, "").strip() if formatted_prompt else full_text
+                print(f"🔍 [DEBUG] 기본 응답 추출 사용")
             
-            # DialoGPT-medium으로 대체 (더 작은 모델)
-            test_model_name = "microsoft/DialoGPT-medium"
-            tokenizer = AutoTokenizer.from_pretrained(test_model_name)
-            model = AutoModelForCausalLM.from_pretrained(test_model_name)
+
+            
+            print(f"🔍 [DEBUG] 추출된 응답 길이: {len(response)}")
+            print(f"🔍 [DEBUG] 최종 응답: {response}")
             
-            return model, tokenizer
+            t_decode_end = time.time()
+            print(f"🔍 [DEBUG] 응답 추출 완료 - 소요시간: {t_decode_end - t_decode_start:.3f}초")
+            
+        except Exception as e:
+            print(f"❌ [DEBUG] 응답 추출 중 에러 발생: {str(e)}")
+            import traceback
+            traceback.print_exc()
+            return {"error": f"Response extraction failed: {str(e)}"}
+        
+        # --- 7. 결과 반환 ---
+        total_time = time.time() - t_tok_start
+        print(f"🔍 [DEBUG] 전체 처리 완료 - 총 소요시간: {total_time:.3f}초")
+        
+        # 🔄 이미지 처리 완료 (전역 변수 초기화는 제거됨)
+        
+        return {
+            "generated_text": response,
+            "processing_time": total_time,
+            "model_name": current_profile.display_name,
+            "image_processed": image_processed,
+            "tokens_generated": generated_ids.shape[1] - input_ids.shape[1],
+            "total_tokens": generated_ids.shape[1]
+        }
+        
+    except Exception as e:
+        print(f"❌ [DEBUG] generate_sync 전체 에러: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return {"error": str(e)}
+
+@app.get("/lora/status")
+async def get_lora_status():
+    """Report the current LoRA state.
+
+    Returns:
+        dict: an ``"error"`` payload when LoRA support is unavailable or an
+        exception is raised; otherwise a ``"success"`` payload carrying the
+        active adapter name, whether a base model is present, and the
+        manager's device (``"unknown"`` when the manager has no ``device``).
+    """
+    try:
+        lora_usable = LORA_AVAILABLE and lora_manager is not None
+        if not lora_usable:
+            return {"status": "error", "message": "LoRA 기능이 사용 불가능합니다"}
+
+        # Read attributes defensively: a manager lacking one yields the default.
+        adapter_name = getattr(lora_manager, 'current_adapter_name', None)
+        base_model = getattr(lora_manager, 'base_model', None)
+        device = getattr(lora_manager, 'device', 'unknown')
+
+        return {
+            "status": "success",
+            "lora_available": True,
+            "current_adapter": adapter_name,
+            "base_model_loaded": base_model is not None,
+            "device": device
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/status")
+async def get_context_status():
+    """Report the context manager's state, including per-session turn counts.
+
+    Returns:
+        dict: an ``"error"`` payload when the context manager is missing or an
+        exception is raised; otherwise a ``"success"`` payload with the number
+        of sessions, a per-session breakdown (total turns, user turns,
+        assistant turns) and the manager's ``max_tokens``, ``max_turns`` and
+        ``strategy`` attributes.
+    """
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+
+        def _count_role(turns, role):
+            # Number of turns in one conversation spoken by the given role.
+            return sum(1 for turn in turns if turn.role == role)
+
+        per_session = {
+            sid: {
+                "turns": len(turns),
+                "user_messages": _count_role(turns, "user"),
+                "assistant_messages": _count_role(turns, "assistant")
+            }
+            for sid, turns in context_manager.session_conversations.items()
+        }
+
+        return {
+            "status": "success",
+            "context_manager_available": True,
+            "total_sessions": len(context_manager.session_conversations),
+            "sessions": per_session,
+            "max_tokens": context_manager.max_tokens,
+            "max_turns": context_manager.max_turns,
+            "strategy": context_manager.strategy
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/history")
+async def get_context_history(session_id: str = None):
+    """Return the stored conversation context, optionally for one session.
+
+    Args:
+        session_id: when given, only that session's context and summary are
+            returned; when omitted, the manager's overall context is returned.
+
+    Returns:
+        dict: a ``"success"`` payload with the context text and its history
+        length, or an ``"error"`` payload when the context manager is missing
+        or an exception is raised.
+    """
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+
+        if not session_id:
+            # No session requested: report the manager-wide context.
+            full_context = context_manager.get_context(include_system=True, max_length=4000)
+            return {
+                "status": "success",
+                "context": full_context,
+                "history_length": len(context_manager.conversation_history),
+                "all_sessions": True
+            }
+
+        # A specific session was requested.
+        session_context = context_manager.get_context(include_system=True, max_length=4000, session_id=session_id)
+        summary = context_manager.get_context_summary(session_id)
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "context": session_context,
+            "history_length": summary.get("total_turns", 0),
+            "session_summary": summary
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/auto-cleanup")
+async def get_auto_cleanup_config():
+    """Return the context manager's current auto-cleanup configuration.
+
+    Returns:
+        dict: ``{"status": "success", "auto_cleanup_config": ...}`` or an
+        ``"error"`` payload when the context manager is missing or the lookup
+        raises.
+    """
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+
+        return {
+            "status": "success",
+            "auto_cleanup_config": context_manager.get_auto_cleanup_config()
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/context/auto-cleanup")
+async def set_auto_cleanup_config(
+    enabled: bool = Form(True),
+    interval_turns: int = Form(8),
+    interval_time: int = Form(300),
+    strategy: str = Form("smart")
+):
+    """Update the context manager's auto-cleanup configuration.
+
+    The four form fields are forwarded unchanged, as keyword arguments, to
+    ``context_manager.set_auto_cleanup_config``.
+
+    Returns:
+        dict: a ``"success"`` payload echoing the configuration read back from
+        the manager, or an ``"error"`` payload when the manager is missing or
+        the update raises.
+    """
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+
+        new_settings = {
+            "enabled": enabled,
+            "interval_turns": interval_turns,
+            "interval_time": interval_time,
+            "strategy": strategy
+        }
+        context_manager.set_auto_cleanup_config(**new_settings)
+
+        # Read the configuration back so the caller sees what is now in effect.
+        effective_config = context_manager.get_auto_cleanup_config()
+        return {
+            "status": "success",
+            "message": "자동 정리 설정이 업데이트되었습니다",
+            "new_config": effective_config
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/context/cleanup/{session_id}")
+async def manual_cleanup_session(session_id: str):
+    """Trigger the context manager's cleanup routine for a single session.
+
+    Args:
+        session_id: session identifier taken from the URL path.
+
+    Returns:
+        dict: a ``"success"`` payload naming the session, or an ``"error"``
+        payload when the manager is missing or the cleanup raises.
+    """
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+
+        # Reuses the manager's auto-cleanup routine for an on-demand run.
+        context_manager._execute_auto_cleanup(session_id)
+
+        response = {"status": "success"}
+        response["message"] = f"세션 {session_id} 수동 정리 완료"
+        response["session_id"] = session_id
+        return response
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/context/cleanup-all")
+async def manual_cleanup_all_sessions():
+    """Trigger the context manager's cleanup routine for every known session.
+
+    Returns:
+        dict: a ``"success"`` payload once every session has been processed,
+        or an ``"error"`` payload when the manager is missing or a cleanup
+        raises.
+    """
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+
+        # Iterate over a snapshot of the session IDs. The cleanup routine is
+        # opaque from here and may add or drop sessions; mutating the dict
+        # while iterating it directly would raise
+        # "RuntimeError: dictionary changed size during iteration".
+        for session_id in list(context_manager.session_conversations):
+            context_manager._execute_auto_cleanup(session_id)
+
+        return {
+            "status": "success",
+            "message": "모든 세션 수동 정리 완료"
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/generate", response_model=GenerateResponse)
+async def generate(request: Request,
+                  prompt: str = Form(...),
+                  image1: UploadFile = File(None),
+                  image2: UploadFile = File(None),
+                  image3: UploadFile = File(None),
+                  image4: UploadFile = File(None),
+                  user_id: str = Form("anonymous"),
+                  room_id: str = Form("default"),
+                  use_context: bool = Form(True),
+                  session_id: str = Form(None)):
+    """Generate a response for ``prompt``, optionally with up to four images.
+
+    Args:
+        request: the incoming request (not read by this handler).
+        prompt: user prompt text.
+        image1..image4: optional image uploads; their raw bytes are passed to
+            ``generate_sync``. An upload that cannot be read is skipped with a
+            warning.
+        user_id: user identifier, used for the auto-generated session ID.
+        room_id: chat room identifier, used for the auto-generated session ID.
+        use_context: when true, the prompt and the reply are recorded in the
+            context manager (if one is available).
+        session_id: conversation session; generated from room, user and the
+            current timestamp when omitted.
+
+    Raises:
+        HTTPException: 503 when no model is loaded; 500 when generation fails.
+    """
     
-    # 별도 스레드에서 모델 로딩
-    loop = asyncio.get_event_loop()
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        model, tokenizer = await loop.run_in_executor(executor, load_model_sync)
+    if not model_loaded:
+        raise HTTPException(status_code=503, detail="모델이 로드되지 않았습니다.")
+
+    # Without a session ID, create one per chat room + user + timestamp.
+    if not session_id:
+        timestamp = int(time.time())
+        session_id = f"room_{room_id}_user_{user_id}_{timestamp}"
+        print(f"🔍 [DEBUG] 자동 세션 ID 생성: {session_id} (채팅방: {room_id}, 사용자: {user_id})")
+
+    # Only record history when requested AND a context manager exists; other
+    # endpoints in this file treat the manager as possibly missing, and calling
+    # into a missing one would surface as an unhandled AttributeError.
+    track_context = use_context and context_manager is not None
+    if track_context:
+        context_manager.add_user_message(prompt, metadata={"session_id": session_id})
+        print(f"🔍 [DEBUG] 사용자 메시지 추가됨 (세션: {session_id})")
+
+    # Collect the raw bytes of every uploaded image.
+    image_data_list = []
+    for img_file in (image1, image2, image3, image4):
+        if img_file:
+            try:
+                image_data_list.append(await img_file.read())
+            except Exception as e:
+                logger.warning(f"이미지 로드 실패: {e}")
+
+    try:
+        # Run generation; use_context is forwarded exactly as requested.
+        result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id, user_id=user_id, room_id=room_id)
+
+        if "error" in result:
+            raise HTTPException(status_code=500, detail=result["error"])
+
+        if track_context:
+            context_manager.add_assistant_message(result["generated_text"], metadata={"session_id": session_id})
+
+        return GenerateResponse(
+            generated_text=result["generated_text"],
+            processing_time=result["processing_time"],
+            model_name=result["model_name"],
+            image_processed=result["image_processed"]
+        )
+
+    except HTTPException:
+        # Already a well-formed HTTP error (e.g. the generate_sync failure
+        # above); re-raise untouched instead of wrapping it a second time.
+        raise
+    except Exception as e:
+        logger.error(f"❌ 생성 중 오류 발생: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"모델 생성 중 오류 발생: {str(e)}")
+
+
+@app.post("/generate-multimodal", response_model=MultimodalGenerateResponse)
+async def generate_multimodal(prompt: str = Form(...),
+                             image: UploadFile = File(None),
+                             model_id: Optional[str] = Form(None),
+                             max_length: Optional[int] = Form(None),
+                             temperature: Optional[float] = Form(None),
+                             top_p: Optional[float] = Form(None),
+                             do_sample: Optional[bool] = Form(None)):
+    """Generate text from a prompt and an optional image with a manual greedy loop.
+
+    Args:
+        prompt: user prompt text.
+        image: optional image upload; decoded to an RGB PIL image. A file that
+            cannot be decoded is logged and the request continues text-only.
+        model_id: only echoed back in the response; no model switch happens here.
+        max_length, temperature, top_p, do_sample: accepted but not used by
+            this handler (decoding is greedy and capped at 64 new tokens).
+
+    Raises:
+        HTTPException: 500 when no model is loaded or generation fails.
+    """
+    global model_loaded, current_profile, model, tokenizer, processor
+
+    if not model_loaded:
+        raise HTTPException(status_code=500, detail="모델이 로드되지 않았습니다")
+
+    start_time = time.time()
+
+    pil_image = None
+    if image:
+        try:
+            data = await image.read()
+            pil_image = Image.open(io.BytesIO(data)).convert("RGB")
+        except Exception as e:
+            logger.error(f"이미지 처리 실패: {e}")
+
+    try:
+        # One "<image>" placeholder turn per image, followed by the prompt turn.
+        image_list = [pil_image] if pil_image else []
+        conv = []
+        if image_list:
+            image_tokens = " ".join(["<image>"] * len(image_list))
+            conv.append({"role": "user", "content": image_tokens})
+        conv.append({"role": "user", "content": prompt})
+
+        logger.info("=== STEP 1: building sample ===")
+        # Pass the image and conversation built above. The previous code sent
+        # an empty image list and a prompt-only conversation, so the image was
+        # dropped while the response still reported image_processed=True.
+        # NOTE(review): assumes the processor takes images under "image" and
+        # "<image>" placeholders in "conv" — confirm against the loaded profile.
+        sample = {"image": image_list, "conv": conv}
+
+        logger.info("=== STEP 2: calling processor ===")
+        inputs = processor.batch_encode_collate([sample], padding_side='left', add_generation_prompt=True)
+        logger.info("=== STEP 3: processor returned ===")
+
+        for k, v in inputs.items():
+            if isinstance(v, torch.Tensor):
+                logger.info(f"Key {k}: tensor shape {v.shape}, dtype {v.dtype}, device {v.device}")
+            else:
+                logger.info(f"Key {k}: {type(v)}")
+
+        logger.info("=== STEP 4: moving to device ===")
+        inputs = {k: (v.to(model.device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}
+        logger.info("=== STEP 5: moved to device ===")
+        eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
+
+        # Manual greedy decoding loop (at most 64 new tokens).
+        # The module-level `torch` import is used throughout. A function-local
+        # `import torch` inside this loop used to make `torch` a local name for
+        # the whole function, so the earlier `torch.Tensor` references failed
+        # before the import ever ran.
+        generated = inputs["input_ids"].clone()
+        with torch.no_grad():
+            for _ in range(64):
+                out = model(**inputs)
+                next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
+                generated = torch.cat([generated, next_token], dim=-1)
+                token_id = next_token.item()
+                logger.info(f"Step token: {token_id}")
+                if token_id == eot_id:
+                    break
+                inputs["input_ids"] = generated
+                # Keep the attention mask (when the processor supplied one) the
+                # same length as input_ids; the appended token is a real token.
+                mask = inputs.get("attention_mask")
+                if isinstance(mask, torch.Tensor):
+                    inputs["attention_mask"] = torch.cat(
+                        [mask, mask.new_ones((mask.shape[0], 1))], dim=-1
+                    )
+
+        logger.info(f"Final Generated IDs: {generated[0].tolist()}")
 
+        generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
+        if "<|im_start|>assistant" in generated_text:
+            response = generated_text.split("<|im_start|>assistant")[-1].split("<|im_end|>")[0].strip()
+        else:
+            response = generated_text.strip()
+
+        processing_time = time.time() - start_time
+        return MultimodalGenerateResponse(generated_text=response,
+                                          processing_time=processing_time,
+                                          model_name=current_profile.display_name,
+                                          model_id=model_id or current_profile.get_model_info().get("model_name"),
+                                          image_processed=bool(pil_image))
+    except Exception as e:
+        logger.error(f"❌ 멀티모달 생성 오류: {e}")
+        raise HTTPException(status_code=500, detail=f"멀티모달 생성 실패: {str(e)}")
+
+
+
+
+@app.get("/models")
+async def list_models():
+    """List the available models together with the currently loaded one.
+
+    Returns:
+        dict: ``"models"`` from ``list_available_models()`` and
+        ``"current_model"`` from the active profile's ``get_model_info()``
+        (``None`` when no profile is set).
+    """
+    available = list_available_models()
+    current_info = None
+    if current_profile:
+        current_info = current_profile.get_model_info()
+    return {"models": available, "current_model": current_info}
+
+@app.post("/switch-model")
+async def switch_model(model_id: str):
+    """Load a different model and report the profile that is now active.
+
+    Args:
+        model_id: identifier passed to ``load_model_async``.
+
+    Raises:
+        HTTPException: 500 when loading, or building the response, fails.
+    """
+    try:
+        await load_model_async(model_id)
+        active_name = current_profile.display_name
+        confirmation = f"모델 변경 성공: {model_id}"
+        return {"message": confirmation, "current_model": active_name}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"모델 변경 실패: {str(e)}")
+    
 @app.get("/", response_model=dict)
 async def root():
-    """루트 엔드포인트"""
-    return {
-        "message": "Lily LLM API 서버",
-        "version": "1.0.0",
-        "model": "Mistral-7B-Instruct-v0.2 (Fine-tuned)",
-        "docs": "/docs"
-    }
+    """Root endpoint: return the server banner, version, active model and docs path."""
+    # The literal string "None" (not null) is reported when no profile is set.
+    active_model = "None"
+    if current_profile:
+        active_model = current_profile.display_name
+
+    banner = {"message": "Lily LLM API v2 서버", "version": "2.0.0"}
+    banner["current_model"] = active_model
+    banner["docs"] = "/docs"
+    return banner
 
 @app.get("/health", response_model=HealthResponse)
 async def health_check():
-    """헬스 체크 엔드포인트"""
-    return HealthResponse(
-        status="healthy",
-        model_loaded=model_loaded,
-        model_name="Lily LLM (Mistral-7B)"
-    )
+    """Health check: report load state, the active model and the available models."""
+    models = list_available_models()
+
+    # The literal string "None" (not null) is reported when no profile is set.
+    if current_profile:
+        active_model = current_profile.display_name
+    else:
+        active_model = "None"
+
+    return HealthResponse(
+        status="healthy",
+        model_loaded=model_loaded,
+        current_model=active_model,
+        available_models=models
+    )
 
-@app.post("/generate", response_model=GenerateResponse)
-async def generate_text(request: GenerateRequest):
-    """텍스트 생성 엔드포인트"""
-    global model, tokenizer
-    
-    if not model_loaded or model is None or tokenizer is None:
-        raise HTTPException(status_code=503, detail="모델이 로드되지 않았습니다")
-    
+@app.post("/document/upload", response_model=DocumentUploadResponse)
+async def upload_document(
+    file: UploadFile = File(...),
+    user_id: str = Form("default_user"),  # default user ID
+    room_id: str = Form("default"),  # chat room ID
+    document_id: Optional[str] = Form(None)  # document ID (generated when omitted)
+):
+    """Upload a document, run RAG processing on it and register it with the room.
+
+    The upload is written to a temporary file in the working directory, handed
+    to ``rag_processor.process_and_store_document`` and removed again whether
+    or not processing succeeds. On success the document's metadata is also
+    recorded in the integrated memory manager; a failure there is logged as a
+    warning and does not fail the upload.
+
+    Args:
+        file: the uploaded document.
+        user_id: owner of the document.
+        room_id: chat room the document is attached to.
+        document_id: optional identifier; an 8-character UUID prefix is
+            generated when omitted.
+
+    Returns:
+        DocumentUploadResponse: the processing outcome. Errors are reported in
+        the response (``success=False``) rather than raised.
+    """
     start_time = time.time()
     
     try:
-        logger.info(f"텍스트 생성 시작: '{request.prompt}'")
-        
-        # polyglot 모델에 맞는 프롬프트 형식으로 수정
-        formatted_prompt = f"질문: {request.prompt}\n답변:"
-        logger.info(f"포맷된 프롬프트: '{formatted_prompt}'")
-        
-        # 입력 토크나이징 - padding 제거하고 패딩 토큰 설정
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-            
-        inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True)
-        logger.info(f"입력 토큰 수: {inputs['input_ids'].shape[1]}")
-        
-        # 텍스트 생성 - 더 강력한 설정으로 수정
-        with torch.no_grad():
-            outputs = model.generate(
-                inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-                max_new_tokens=request.max_length,
-                do_sample=True,
-                temperature=0.9,  # 더 높은 temperature
-                top_k=50,         # top_k 추가
-                top_p=0.95,       # top_p 추가
-                repetition_penalty=1.2,  # 반복 방지
-                no_repeat_ngram_size=2,  # n-gram 반복 방지
-                pad_token_id=tokenizer.eos_token_id,
-                eos_token_id=tokenizer.eos_token_id
-            )
+        # Generate a document ID when the client did not provide one.
+        if not document_id:
+            import uuid
+            document_id = str(uuid.uuid4())[:8]
         
-        logger.info(f"생성된 토큰 수: {outputs.shape[1]}")
+        # Both names come from the client. Strip path separators before they
+        # go into the temp path so a crafted value such as "../../x" cannot
+        # place the file outside the working directory. Ordinary names yield
+        # the same path as before.
+        filename = file.filename or "unknown"
+        safe_doc_id = str(document_id).replace("/", "_").replace("\\", "_")
+        safe_filename = os.path.basename(filename.replace("\\", "/")) or "upload"
+        temp_file_path = f"./temp_{safe_doc_id}_{safe_filename}"
         
-        # 결과 디코딩
-        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        logger.info(f"디코딩된 전체 텍스트: '{generated_text}'")
-        
-        # polyglot 응답 부분만 추출
-        if "답변:" in generated_text:
-            response = generated_text.split("답변:")[-1].strip()
-            logger.info(f"답변 추출: '{response}'")
-        else:
-            # 기존 방식으로 프롬프트 제거
-            if formatted_prompt in generated_text:
-                response = generated_text.replace(formatted_prompt, "").strip()
-            else:
-                response = generated_text.strip()
-            logger.info(f"프롬프트 제거 후: '{response}'")
+        try:
+            # Save the upload to the temporary file.
+            content = await file.read()
+            with open(temp_file_path, "wb") as f:
+                f.write(content)
+
+            # Process the document and store it in the vector store.
+            result = rag_processor.process_and_store_document(
+                user_id, document_id, temp_file_path
+            )
         
-        # 빈 응답 처리
-        if not response.strip():
-            logger.warning("생성된 텍스트가 비어있음, 기본 응답 사용")
-            response = "안녕하세요! 무엇을 도와드릴까요?"
+        finally:
+            # Always remove the temp file, including when processing raised.
+            # Cleanup is best-effort: a failed delete is logged, not raised.
+            try:
+                if os.path.exists(temp_file_path):
+                    os.remove(temp_file_path)
+            except OSError as cleanup_error:
+                logger.warning(f"⚠️ 임시 파일 삭제 실패: {cleanup_error}")
         
         processing_time = time.time() - start_time
+        logger.info(f"📄 문서 업로드 완료 ({processing_time:.2f}초): {filename}")
         
-        logger.info(f"생성 완료: {processing_time:.2f}초, 텍스트 길이: {len(response)}")
+        # Register the document with the memory system.
+        if result["success"]:
+            try:
+                chunks = result.get("chunks", [])
+                chunk_count = len(chunks) if isinstance(chunks, list) else 0
+
+                document_info = {
+                    "document_id": document_id,
+                    "filename": filename,
+                    "uploaded_by": user_id,
+                    "document_type": filename.split('.')[-1].lower() if '.' in filename else "unknown",
+                    "page_count": result.get("page_count", 0),
+                    "chunk_count": chunk_count,
+                    "summary": result.get("message", "")
+                }
+
+                # Attach the document to the chat room context.
+                integrated_memory_manager.add_document_to_room(room_id, document_info)
+
+                # Update the user's conversation statistics.
+                integrated_memory_manager.record_conversation(
+                    user_id, room_id,
+                    topic=f"문서 업로드: {filename}"
+                )
+
+                logger.info(f"✅ 메모리 시스템에 문서 정보 추가 완료: {room_id} - {filename}")
+
+            except Exception as e:
+                logger.warning(f"⚠️ 메모리 시스템 업데이트 실패: {e}")
         
-        return GenerateResponse(
-            generated_text=response,
-            processing_time=processing_time
+        # No AI summary is generated automatically after an upload (saves
+        # model resources); a response is produced only when the user asks.
+        if result["success"]:
+            result["auto_response"] = f"문서 '{filename}' 업로드 완료! 이제 질문해주세요."
+            logger.info(f"📄 자동 AI 응답 생성 건너뛰기 - AI 리소스 절약 (사용자 질문 시에만 AI 응답)")
+        else:
+            result["auto_response"] = "문서 업로드에 실패했습니다."
+
+        return DocumentUploadResponse(
+            success=result["success"],
+            document_id=document_id,
+            message=result.get("message", ""),
+            chunks=result.get("chunks"),
+            latex_count=result.get("latex_count"),
+            error=result.get("error"),
+            auto_response=result.get("auto_response", "")
         )
         
     except Exception as e:
-        logger.error(f"텍스트 생성 오류: {e}")
-        raise HTTPException(status_code=500, detail=f"텍스트 생성 실패: {str(e)}")
+        logger.error(f"❌ 문서 업로드 실패: {e}")
+        return DocumentUploadResponse(
+            success=False,
+            document_id=document_id or "unknown",
+            message="문서 업로드 중 오류가 발생했습니다.",
+            error=str(e)
+        )
 
-@app.get("/models")
-async def list_models():
-    """사용 가능한 모델 목록"""
+@app.post("/summarize/conversation")
+async def summarize_conversation(
+    room_id: str = Form("default"),
+    user_id: str = Form("anonymous"),
+    max_length: int = Form(300)
+):
+    """Create a conversation summary for a chat room.
+
+    Args:
+        room_id: chat room whose conversation is summarized.
+        user_id: accepted for API symmetry; not used by this handler.
+        max_length: passed to the memory manager's summary routine.
+
+    Returns:
+        dict: ``"success"`` plus a message; on success also the room's
+        summary, key topics and ``room_id``.
+    """
+    try:
+        if not text_summarizer.is_available():
+            return {
+                "success": False,
+                "message": "summarizers 라이브러리를 사용할 수 없습니다."
+            }
+
+        summarized = integrated_memory_manager.create_smart_conversation_summary(
+            room_id, max_length
+        )
+        if not summarized:
+            return {
+                "success": False,
+                "message": "대화 요약 생성 실패"
+            }
+
+        # Re-read the room context to pick up the freshly written summary.
+        room_context = integrated_memory_manager.room_context_manager.get_room_context(room_id)
+        summary_text = room_context.conversation_summary if room_context else ""
+        topics = room_context.key_topics if room_context else []
+
+        return {
+            "success": True,
+            "message": "대화 요약 생성 완료",
+            "summary": summary_text,
+            "key_topics": topics,
+            "room_id": room_id
+        }
+
+    except Exception as e:
+        logger.error(f"❌ 대화 요약 생성 실패: {e}")
+        return {
+            "success": False,
+            "message": f"대화 요약 생성 중 오류 발생: {str(e)}"
+        }
+
+@app.post("/summarize/text")
+async def summarize_text(
+    text: str = Form(...),
+    max_length: int = Form(200),
+    model_name: str = Form("kobart")
+):
+    """Summarize a piece of text with the summarizer backend.
+
+    Args:
+        text: text to summarize; must be at least 50 characters after
+            stripping surrounding whitespace.
+        max_length: upper summary length; the minimum is set to half of it.
+        model_name: summarizer model passed through to the backend.
+
+    Returns:
+        dict: ``"success"`` plus a message; on success also the original and
+        summary lengths, their ratio rounded to two decimals, the summary and
+        the model used.
+    """
+    try:
+        if not text_summarizer.is_available():
+            return {
+                "success": False,
+                "message": "summarizers 라이브러리를 사용할 수 없습니다."
+            }
+
+        stripped_length = len(text.strip()) if text else 0
+        if stripped_length < 50:
+            return {
+                "success": False,
+                "message": "요약할 텍스트가 너무 짧습니다 (최소 50자 필요)"
+            }
+
+        # Summary settings: the minimum length is half of the requested maximum.
+        half_length = max_length // 2
+        config = SummaryConfig(
+            max_length=max_length,
+            min_length=half_length,
+            do_sample=False,
+            temperature=0.7,
+            top_p=0.9
+        )
+
+        summary = text_summarizer.summarize_text(text, model_name, config)
+        if not summary:
+            return {
+                "success": False,
+                "message": "요약 생성 실패"
+            }
+
+        original_length = len(text)
+        summary_length = len(summary)
+        return {
+            "success": True,
+            "message": "텍스트 요약 완료",
+            "original_length": original_length,
+            "summary_length": summary_length,
+            "compression_ratio": round(summary_length / original_length, 2),
+            "summary": summary,
+            "model_used": model_name
+        }
+
+    except Exception as e:
+        logger.error(f"❌ 텍스트 요약 실패: {e}")
+        return {
+            "success": False,
+            "message": f"텍스트 요약 중 오류 발생: {str(e)}"
+        }
+
+@app.post("/compress/context")
+async def compress_context(
+    room_id: str = Form("default"),
+    target_length: int = Form(800)
+):
+    """Compress a chat room's stored context.
+
+    Args:
+        room_id: chat room whose context is compressed.
+        target_length: passed to the memory manager's compression routine.
+
+    Returns:
+        dict: ``"success"`` plus a message; on success also ``room_id`` and
+        ``target_length``.
+    """
+    try:
+        if not text_summarizer.is_available():
+            return {
+                "success": False,
+                "message": "summarizers 라이브러리를 사용할 수 없습니다."
+            }
+
+        compressed = integrated_memory_manager.compress_room_context(room_id, target_length)
+        if not compressed:
+            return {
+                "success": False,
+                "message": "컨텍스트 압축 실패"
+            }
+
+        return {
+            "success": True,
+            "message": "컨텍스트 압축 완료",
+            "room_id": room_id,
+            "target_length": target_length
+        }
+
+    except Exception as e:
+        logger.error(f"❌ 컨텍스트 압축 실패: {e}")
+        return {
+            "success": False,
+            "message": f"컨텍스트 압축 중 오류 발생: {str(e)}"
+        }
+
+@app.get("/summarizer/status")
+async def get_summarizer_status():
+    """Report whether the summarizer backend is usable and which models it offers.
+
+    Returns:
+        dict: on success, the availability flag, the model list (empty when
+        unavailable) and the default model name (``None`` when unavailable);
+        on error, ``"success": False`` with a message.
+    """
+    try:
+        available = text_summarizer.is_available()
+        if available:
+            models = text_summarizer.get_available_models()
+            default_model = "hyunwoongko/kobart"
+        else:
+            models = []
+            default_model = None
+
+        return {
+            "success": True,
+            "summarizers_available": available,
+            "available_models": models,
+            "default_model": default_model
+        }
+
+    except Exception as e:
+        logger.error(f"❌ summarizer 상태 확인 실패: {e}")
+        return {
+            "success": False,
+            "message": f"상태 확인 중 오류 발생: {str(e)}"
+        }
+
+@app.post("/rag/generate", response_model=RAGResponse)
+async def generate_rag_response(
+    query: str = Form(...),
+    user_id: str = Form("default_user"),
+    document_id: str = Form(...),
+    max_length: Optional[int] = Form(None),
+    temperature: Optional[float] = Form(None),
+    top_p: Optional[float] = Form(None),
+    do_sample: Optional[bool] = Form(None)
+):
+    """Generate a RAG-based answer to ``query`` from one stored document.
+
+    Args:
+        query: the user's question.
+        user_id: owner of the document.
+        document_id: document to search.
+        max_length, temperature, top_p, do_sample: accepted but not used by
+            this handler.
+
+    Returns:
+        RAGResponse: the RAG result with the elapsed processing time. Errors
+        are reported in the response (``success=False``) rather than raised.
+    """
+    start_time = time.time()
+
+    try:
+        # The loaded model is handed to the RAG processor only when it exposes
+        # ``generate_text``; otherwise the processor runs without an LLM.
+        llm_model = None
+        if model is not None and hasattr(model, 'generate_text'):
+            llm_model = model
+            logger.info("✅ 로드된 모델을 사용하여 RAG 응답 생성")
+        else:
+            logger.warning("⚠️ 모델이 로드되지 않아 텍스트 기반 응답만 생성")
+
+        result = rag_processor.generate_rag_response(
+            user_id, document_id, query, llm_model=llm_model
+        )
+
+        processing_time = time.time() - start_time
+        logger.info(f"🔍 RAG 응답 생성 완료 ({processing_time:.2f}초)")
+
+        return RAGResponse(
+            success=result["success"],
+            response=result["response"],
+            context=result["context"],
+            sources=result["sources"],
+            search_results=result["search_results"],
+            processing_time=processing_time
+        )
+
+    except Exception as e:
+        logger.error(f"❌ RAG 응답 생성 실패: {e}")
+        return RAGResponse(
+            success=False,
+            response=f"RAG 응답 생성 중 오류가 발생했습니다: {str(e)}",
+            context="",
+            sources=[],
+            search_results=0,
+            # Report the real elapsed time on failure too, matching the
+            # hybrid RAG endpoint, instead of a hard-coded 0.0.
+            processing_time=time.time() - start_time
+        )
+
+@app.post("/rag/generate-hybrid", response_model=RAGResponse)
+async def generate_hybrid_rag_response(
+    query: str = Form(...),
+    user_id: str = Form("default_user"),
+    document_id: str = Form(...),
+    image1: UploadFile = File(None),
+    image2: UploadFile = File(None),
+    image3: UploadFile = File(None),
+    image4: UploadFile = File(None),
+    image5: UploadFile = File(None),
+    max_length: Optional[int] = Form(None),
+    temperature: Optional[float] = Form(None),
+    top_p: Optional[float] = Form(None),
+    do_sample: Optional[bool] = Form(None)
+):
+    """Generate a hybrid RAG answer from a stored document plus up to five images.
+
+    Each uploaded image is written to a temporary ``.png``-suffixed file whose
+    path is passed to the RAG processor. Every temporary file created here is
+    removed before the handler returns, whether generation succeeds or fails.
+
+    Args:
+        query: the user's question.
+        user_id: owner of the document.
+        document_id: document to search.
+        image1..image5: optional image uploads; one that cannot be read or
+            saved is logged and skipped.
+        max_length, temperature, top_p, do_sample: accepted but not used by
+            this handler.
+
+    Returns:
+        RAGResponse: the RAG result with the elapsed processing time. Errors
+        are reported in the response (``success=False``) rather than raised.
+    """
+    import tempfile
+
+    start_time = time.time()
+    temp_paths = []   # every temp file created, for cleanup
+    image_files = []  # only the fully written files, passed to the processor
+
+    try:
+        uploaded_images = [image1, image2, image3, image4, image5]
+
+        for i, img in enumerate(uploaded_images):
+            if not img:
+                continue
+            try:
+                # Read first so a failed read leaves no temp file behind.
+                image_data = await img.read()
+                with tempfile.NamedTemporaryFile(
+                    suffix=f"_{i}.png",
+                    delete=False,
+                    prefix="hybrid_image_"
+                ) as temp_file:
+                    # Register before writing so a failed write is still cleaned up.
+                    temp_paths.append(temp_file.name)
+                    temp_file.write(image_data)
+                image_files.append(temp_file.name)
+                logger.info(f"📸 이미지 업로드: {img.filename} -> {temp_file.name}")
+            except Exception as e:
+                logger.error(f"❌ 이미지 처리 실패: {e}")
+
+        # Generate the RAG response, with images when any were saved.
+        result = rag_processor.generate_rag_response(
+            user_id, document_id, query,
+            llm_model=model,  # the loaded model instance
+            image_files=image_files if image_files else None
+        )
+
+        processing_time = time.time() - start_time
+        logger.info(f"🔍 하이브리드 RAG 응답 생성 완료 ({processing_time:.2f}초)")
+
+        return RAGResponse(
+            success=result["success"],
+            response=result["response"],
+            context=result["context"],
+            sources=result["sources"],
+            search_results=result["search_results"],
+            processing_time=processing_time
+        )
+
+    except Exception as e:
+        logger.error(f"❌ 하이브리드 RAG 응답 생성 실패: {e}")
+        return RAGResponse(
+            success=False,
+            response=f"응답 생성 중 오류가 발생했습니다: {str(e)}",
+            context="",
+            sources=[],
+            search_results=0,
+            processing_time=time.time() - start_time
+        )
+
+    finally:
+        # Remove the temp image files on every exit path; previously they were
+        # left on disk whenever generation raised.
+        for temp_path in temp_paths:
+            try:
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)
+                    logger.info(f"🗑️ 임시 이미지 파일 삭제: {temp_path}")
+            except Exception as e:
+                logger.warning(f"⚠️ 임시 파일 삭제 실패: {e}")
+
+@app.get("/documents/{user_id}")
+async def list_user_documents(user_id: str):
+    """Return every document the vector store holds for ``user_id``.
+
+    On failure the error is logged and an empty listing carrying an
+    ``error`` field is returned instead of raising.
+    """
+    try:
+        # Imported inside the try so an import failure is reported through
+        # the same error payload as a failed lookup.
+        from lily_llm_core.vector_store_manager import vector_store_manager as store
+        return store.get_all_documents(user_id)
+    except Exception as exc:
+        logger.error(f"❌ 문서 목록 조회 실패: {exc}")
+        return {"documents": [], "total_docs": 0, "error": str(exc)}
+
+@app.delete("/document/{user_id}/{document_id}")
+async def delete_document(user_id: str, document_id: str):
+    """Delete one of a user's documents through the RAG processor.
+
+    The processor's result is returned unchanged; any exception is logged
+    and converted into ``{"success": False, "error": ...}``.
+    """
+    try:
+        return rag_processor.delete_document(user_id, document_id)
+    except Exception as exc:
+        logger.error(f"❌ 문서 삭제 실패: {exc}")
+        failure = {"success": False, "error": str(exc)}
+        return failure
+
+# 사용자 관리 엔드포인트
+@app.post("/user/create", response_model=UserResponse)
+async def create_user(
+    user_id: str = Form(...),
+    username: Optional[str] = Form(None),
+    email: Optional[str] = Form(None)
+):
+    """Create a user record and echo back what the database stored.
+
+    Args:
+        user_id: Identifier of the user to create (required form field).
+        username: Optional display name.
+        email: Optional e-mail address.
+
+    Returns:
+        A ``UserResponse``; ``success`` is False with an ``error`` message
+        when the insert is rejected or an exception occurs.
+    """
+    try:
+        if not db_manager.add_user(user_id, username, email):
+            return UserResponse(success=False, user_id=user_id, error="사용자 생성 실패")
+
+        # Re-read the row so the response reflects the stored values; if the
+        # lookup comes back empty every optional field is simply None.
+        stored = db_manager.get_user(user_id)
+        if stored:
+            name = stored.get('username')
+            mail = stored.get('email')
+            created = stored.get('created_at')
+        else:
+            name = mail = created = None
+
+        return UserResponse(
+            success=True,
+            user_id=user_id,
+            username=name,
+            email=mail,
+            created_at=created
+        )
+    except Exception as exc:
+        logger.error(f"❌ 사용자 생성 오류: {exc}")
+        return UserResponse(success=False, user_id=user_id, error=str(exc))
+
+@app.get("/user/{user_id}", response_model=UserResponse)
+async def get_user_info(user_id: str):
+    """Look up a single user and return the stored profile fields.
+
+    Returns a ``UserResponse`` with ``success=False`` and an ``error``
+    message when the user does not exist or the lookup raises.
+    """
+    try:
+        record = db_manager.get_user(user_id)
+        if not record:
+            return UserResponse(success=False, user_id=user_id, error="사용자를 찾을 수 없습니다")
+
+        # Copy only the fields the response model exposes.
+        profile = {field: record.get(field) for field in ('username', 'email', 'created_at')}
+        return UserResponse(success=True, user_id=user_id, **profile)
+    except Exception as exc:
+        logger.error(f"❌ 사용자 조회 오류: {exc}")
+        return UserResponse(success=False, user_id=user_id, error=str(exc))
+
+# 세션 관리 엔드포인트
+@app.post("/session/create", response_model=SessionResponse)
+async def create_session(
+    user_id: str = Form(...),
+    session_name: Optional[str] = Form(None)
+):
+    """Create a chat session for ``user_id``.
+
+    Returns a ``SessionResponse`` holding the new session ID, or a failure
+    response with an empty ``session_id`` when creation fails or raises.
+    """
+    try:
+        new_session_id = db_manager.create_chat_session(user_id, session_name)
+        if not new_session_id:
+            return SessionResponse(success=False, session_id="", error="세션 생성 실패")
+        return SessionResponse(success=True, session_id=new_session_id, session_name=session_name)
+    except Exception as exc:
+        logger.error(f"❌ 세션 생성 오류: {exc}")
+        return SessionResponse(success=False, session_id="", error=str(exc))
+
+@app.get("/sessions/{user_id}")
+async def list_user_sessions(user_id: str):
+    """List the chat sessions recorded for ``user_id``.
+
+    Returns a dict with the sessions and their count, or
+    ``{"success": False, "error": ...}`` when the lookup raises.
+    """
+    try:
+        found = db_manager.get_user_sessions(user_id)
+        listing = {"success": True, "user_id": user_id}
+        listing["sessions"] = found
+        listing["total_sessions"] = len(found)
+        return listing
+    except Exception as exc:
+        logger.error(f"❌ 세션 목록 조회 오류: {exc}")
+        return {"success": False, "error": str(exc)}
+
+# 채팅 메시지 엔드포인트
+@app.post("/chat/message", response_model=ChatMessageResponse)
+async def add_chat_message(
+    session_id: str = Form(...),
+    user_id: str = Form(...),
+    message_type: str = Form(...),
+    content: str = Form(...)
+):
+    """Store one chat message in the given session.
+
+    Returns:
+        A ``ChatMessageResponse`` echoing the message on success. On failure
+        the payload fields are empty and ``error`` carries the reason.
+    """
+    def _failed(reason: str):
+        # Every failure path shares the same empty payload; only the reason differs.
+        return ChatMessageResponse(
+            success=False,
+            message_id=0,
+            content="",
+            message_type="",
+            timestamp="",
+            error=reason
+        )
+
+    try:
+        stored = db_manager.add_chat_message(session_id, user_id, message_type, content)
+        if not stored:
+            return _failed("메시지 추가 실패")
+
+        return ChatMessageResponse(
+            success=True,
+            message_id=0,  # placeholder: the real ID is generated by the DB
+            content=content,
+            message_type=message_type,
+            timestamp=datetime.now().isoformat()
+        )
+    except Exception as exc:
+        logger.error(f"❌ 메시지 추가 오류: {exc}")
+        return _failed(str(exc))
+
+@app.get("/chat/history/{session_id}")
+async def get_chat_history(session_id: str, limit: int = 50):
+    """Return the stored messages of a chat session.
+
+    Args:
+        session_id: Session whose history is requested.
+        limit: Passed through to the database lookup (default 50).
+
+    Returns:
+        A dict with the messages and their count, or
+        ``{"success": False, "error": ...}`` when the lookup raises.
+    """
+    try:
+        history = db_manager.get_chat_history(session_id, limit)
+        payload = {"success": True, "session_id": session_id}
+        payload["messages"] = history
+        payload["total_messages"] = len(history)
+        return payload
+    except Exception as exc:
+        logger.error(f"❌ 채팅 히스토리 조회 오류: {exc}")
+        return {"success": False, "error": str(exc)}
+
+# 문서 관리 엔드포인트 (DB 연동)
+@app.get("/documents/db/{user_id}")
+async def list_user_documents_db(user_id: str):
+    """List a user's documents as recorded in the database.
+
+    Returns a dict with the documents and their count, or
+    ``{"success": False, "error": ...}`` when the lookup raises.
+    """
+    try:
+        rows = db_manager.get_user_documents(user_id)
+        listing = {"success": True, "user_id": user_id}
+        listing["documents"] = rows
+        listing["total_documents"] = len(rows)
+        return listing
+    except Exception as exc:
+        logger.error(f"❌ 문서 목록 조회 오류: {exc}")
+        return {"success": False, "error": str(exc)}
+
+# 인증 엔드포인트
+@app.post("/auth/login", response_model=LoginResponse)
+async def login(
+    user_id: str = Form(...),
+    password: str = Form(...)
+):
+    """Authenticate a user and issue access/refresh tokens.
+
+    Returns a ``LoginResponse`` with the tokens on success, or with
+    ``success=False`` and an ``error`` message when the user is unknown, the
+    password check fails, or an exception occurs.
+    """
+    try:
+        account = db_manager.get_user(user_id)
+        if not account:
+            return LoginResponse(success=False, error="사용자를 찾을 수 없습니다")
+
+        # NOTE(review): the password is verified against the literal
+        # "dummy_hash", not a per-user hash loaded from the database. This is
+        # a placeholder and must be replaced with the stored hash before the
+        # endpoint can be relied on for authentication.
+        password_ok = auth_manager.verify_password(password, "dummy_hash")
+        if not password_ok:
+            return LoginResponse(success=False, error="비밀번호가 올바르지 않습니다")
+
+        display_name = account.get('username')
+        issued = auth_manager.create_user_tokens(user_id, display_name)
+
+        return LoginResponse(
+            success=True,
+            access_token=issued['access_token'],
+            refresh_token=issued['refresh_token'],
+            token_type=issued['token_type'],
+            user_id=user_id,
+            username=display_name
+        )
+    except Exception as exc:
+        logger.error(f"❌ 로그인 오류: {exc}")
+        return LoginResponse(success=False, error=str(exc))
+
+@app.post("/auth/refresh", response_model=TokenResponse)
+async def refresh_token(refresh_token: str = Form(...)):
+    """Exchange a refresh token for a new access token.
+
+    Returns a ``TokenResponse`` with the new bearer token, or a failure
+    response when the auth manager yields no token or raises.
+    """
+    try:
+        renewed = auth_manager.refresh_access_token(refresh_token)
+        if not renewed:
+            return TokenResponse(success=False, error="유효하지 않은 리프레시 토큰입니다")
+        return TokenResponse(success=True, access_token=renewed, token_type="bearer")
+    except Exception as exc:
+        logger.error(f"❌ 토큰 갱신 오류: {exc}")
+        return TokenResponse(success=False, error=str(exc))
+
+@app.post("/auth/register", response_model=LoginResponse)
+async def register(
+    user_id: str = Form(...),
+    username: str = Form(...),
+    password: str = Form(...),
+    email: Optional[str] = Form(None)
+):
+    """Register a new user and issue access/refresh tokens.
+
+    Args:
+        user_id: Desired user ID; registration is refused if it already exists.
+        username: Display name stored with the user.
+        password: Plain-text password supplied by the client.
+        email: Optional e-mail address.
+
+    Returns:
+        A ``LoginResponse`` with the tokens on success, or with
+        ``success=False`` and an ``error`` message otherwise.
+    """
+    try:
+        # Refuse duplicate IDs up front.
+        existing_user = db_manager.get_user(user_id)
+        if existing_user:
+            return LoginResponse(success=False, error="이미 존재하는 사용자 ID입니다")
+
+        # NOTE(review): the hash is computed but never persisted, because
+        # add_user() below receives only user_id/username/email. Until the
+        # hash is stored, the password cannot be verified at login.
+        hashed_password = auth_manager.hash_password(password)
+
+        success = db_manager.add_user(user_id, username, email)
+        if not success:
+            return LoginResponse(success=False, error="사용자 등록에 실패했습니다")
+
+        tokens = auth_manager.create_user_tokens(user_id, username)
+        return LoginResponse(
+            success=True,
+            access_token=tokens['access_token'],
+            refresh_token=tokens['refresh_token'],
+            token_type=tokens['token_type'],
+            user_id=user_id,
+            username=username
+        )
+    except Exception as e:
+        # The original log text contained mis-encoded characters; restored.
+        logger.error(f"❌ 사용자 등록 오류: {e}")
+        return LoginResponse(success=False, error=str(e))
+
+@app.get("/auth/me")
+async def get_current_user_info(credentials: HTTPAuthorizationCredentials = Depends(auth_manager.security)):
+    """Return the identity fields carried by the caller's credentials.
+
+    The values come from ``auth_manager.get_current_user``; any exception is
+    logged and reported as ``{"success": False, "error": ...}``.
+    """
+    try:
+        claims = auth_manager.get_current_user(credentials)
+        identity = {"success": True}
+        # Map response keys to the keys read from the auth manager's result.
+        for out_key, claim_key in (("user_id", "sub"), ("username", "username"), ("token_type", "type")):
+            identity[out_key] = claims.get(claim_key)
+        return identity
+    except Exception as exc:
+        logger.error(f"❌ 사용자 정보 조회 오류: {exc}")
+        return {"success": False, "error": str(exc)}
+    
+# WebSocket 실시간 채팅 엔드포인트
+@app.websocket("/ws/{user_id}")
+async def websocket_endpoint(websocket: WebSocket, user_id: str, session_id: str = None):
+    """Real-time chat endpoint over WebSocket.
+
+    Registers the socket with ``connection_manager``, announces the user to
+    everyone else, then processes JSON messages until the client disconnects.
+    The ``type`` field of each message (default ``"chat"``) selects the
+    handling: ``chat``, ``typing`` or ``join_session``. Any other type is only
+    logged. On exit the user is unregistered and a disconnect notice is
+    broadcast.
+
+    Args:
+        websocket: The client connection.
+        user_id: ID of the connecting user (path parameter).
+        session_id: Optional session to attach to at connect time.
+    """
+    try:
+        # Register the connection with the manager
+        await connection_manager.connect(websocket, user_id, session_id)
+        
+        # Tell every other user that this user connected
+        await connection_manager.broadcast_message({
+            "type": "user_connected",
+            "user_id": user_id,
+            "session_id": session_id,
+            "timestamp": datetime.now().isoformat()
+        }, exclude_user=user_id)
+        
+        # Receive loop
+        while True:
+            try:
+                # Wait for the next text frame and decode it as JSON
+                data = await websocket.receive_text()
+                message_data = json.loads(data)
+                
+                # Dispatch on the message type
+                message_type = message_data.get("type", "chat")
+                
+                if message_type == "chat":
+                    # Chat message
+                    content = message_data.get("content", "")
+                    # NOTE(review): this rebinds the `session_id` parameter; a
+                    # chat message without "session_id" sets it to None for
+                    # the rest of this iteration.
+                    session_id = message_data.get("session_id")
+                    
+                    # Persist the message only when a session is given
+                    if session_id:
+                        db_manager.add_chat_message(
+                            session_id=session_id,
+                            user_id=user_id,
+                            message_type="user",
+                            content=content
+                        )
+                    
+                    # Forward the message to the session, excluding the sender
+                    await connection_manager.send_session_message({
+                        "type": "chat_message",
+                        "user_id": user_id,
+                        "content": content,
+                        "session_id": session_id,
+                        "timestamp": datetime.now().isoformat()
+                    }, session_id, exclude_user=user_id)
+                    
+                    # Optional AI reply, requested per message
+                    if message_data.get("generate_ai_response", False):
+                        # Generate the reply
+                        ai_response = await generate_ai_response(content, user_id)
+                        
+                        # Persist the AI reply
+                        if session_id:
+                            db_manager.add_chat_message(
+                                session_id=session_id,
+                                user_id="ai_assistant",
+                                message_type="assistant",
+                                content=ai_response
+                            )
+                        
+                        # Send the AI reply to the session (sender included)
+                        await connection_manager.send_session_message({
+                            "type": "ai_response",
+                            "user_id": "ai_assistant",
+                            "content": ai_response,
+                            "session_id": session_id,
+                            "timestamp": datetime.now().isoformat()
+                        }, session_id)
+                
+                elif message_type == "typing":
+                    # Relay the typing indicator to the session
+                    await connection_manager.send_session_message({
+                        "type": "user_typing",
+                        "user_id": user_id,
+                        "session_id": message_data.get("session_id"),
+                        "timestamp": datetime.now().isoformat()
+                    }, message_data.get("session_id"), exclude_user=user_id)
+                
+                elif message_type == "join_session":
+                    # Move the user to another session by editing the
+                    # connection manager's bookkeeping directly
+                    new_session_id = message_data.get("session_id")
+                    if new_session_id:
+                        # Remove the user from the previous session
+                        if user_id in connection_manager.connection_info:
+                            old_session_id = connection_manager.connection_info[user_id].get("session_id")
+                            if old_session_id and old_session_id in connection_manager.session_connections:
+                                connection_manager.session_connections[old_session_id].discard(user_id)
+                        
+                        # Add the user to the new session
+                        if new_session_id not in connection_manager.session_connections:
+                            connection_manager.session_connections[new_session_id] = set()
+                        connection_manager.session_connections[new_session_id].add(user_id)
+                        
+                        # Record the new session on the connection info
+                        if user_id in connection_manager.connection_info:
+                            connection_manager.connection_info[user_id]["session_id"] = new_session_id
+                        
+                        # Notify the other members of the new session
+                        await connection_manager.send_session_message({
+                            "type": "user_joined_session",
+                            "user_id": user_id,
+                            "session_id": new_session_id,
+                            "timestamp": datetime.now().isoformat()
+                        }, new_session_id, exclude_user=user_id)
+                
+                # Logged for every message, including unrecognized types
+                logger.info(f"📨 WebSocket 메시지 처리: {user_id} - {message_type}")
+                
+            except WebSocketDisconnect:
+                logger.info(f"🔌 WebSocket 연결 끊김: {user_id}")
+                break
+            except json.JSONDecodeError:
+                # Malformed JSON: report it to the client and keep the loop running
+                logger.warning(f"⚠️ 잘못된 JSON 형식: {user_id}")
+                await websocket.send_text(json.dumps({
+                    "type": "error",
+                    "message": "잘못된 메시지 형식입니다."
+                }))
+            except Exception as e:
+                # Any other failure: report a generic error and keep the loop
+                # running. If this send itself raises, the exception
+                # propagates to the outer handlers below.
+                logger.error(f"❌ WebSocket 메시지 처리 오류: {e}")
+                await websocket.send_text(json.dumps({
+                    "type": "error",
+                    "message": "메시지 처리 중 오류가 발생했습니다."
+                }))
+    
+    except WebSocketDisconnect:
+        logger.info(f"🔌 WebSocket 연결 끊김: {user_id}")
+    except Exception as e:
+        logger.error(f"❌ WebSocket 엔드포인트 오류: {e}")
+    finally:
+        # Unregister the connection
+        connection_manager.disconnect(user_id)
+        
+        # Announce the disconnect to the remaining users.
+        # NOTE(review): an exception raised by this broadcast is not caught here.
+        await connection_manager.broadcast_message({
+            "type": "user_disconnected",
+            "user_id": user_id,
+            "timestamp": datetime.now().isoformat()
+        }, exclude_user=user_id)
+
+async def generate_ai_response(content: str, user_id: str) -> str:
+    """Produce an AI reply for ``content`` using ``generate_sync``.
+
+    Returns the ``"response"`` entry of the generation result, or a fixed
+    apology string when that entry is missing or generation raises.
+    """
+    fallback = "죄송합니다. 응답을 생성할 수 없습니다."
+    try:
+        generated = await generate_sync(content, user_id)
+        return generated.get("response", fallback)
+    except Exception as exc:
+        logger.error(f"❌ AI 응답 생성 실패: {exc}")
+        return fallback
+
+# WebSocket 상태 조회 엔드포인트
+@app.get("/ws/status")
+async def get_websocket_status():
+    """Report the current WebSocket connection state.
+
+    Returns a dict with the connection count and active users reported by
+    ``connection_manager``, plus the keys of its ``session_connections`` map.
+    """
     return {
-        "models": [
-            {
-                "id": "lily-llm",
-                "name": "Lily LLM",
-                "description": "Hearth Chat용 파인튜닝된 Mistral-7B 모델",
-                "base_model": "mistralai/Mistral-7B-Instruct-v0.2",
-                "fine_tuned": True
-            }
-        ]
+        "active_connections": connection_manager.get_connection_count(),
+        "active_users": connection_manager.get_active_users(),
+        "sessions": list(connection_manager.session_connections.keys())
     }
 
-if __name__ == "__main__":
-    uvicorn.run(
-        app,
-        host="0.0.0.0",
-        port=8001,
-        reload=False,
-        log_level="info"
-    ) 
+# Celery 백그라운드 작업 엔드포인트
+@app.post("/tasks/document/process")
+async def start_document_processing(
+    user_id: str = Form(...),
+    document_id: str = Form(...),
+    file_path: str = Form(...),
+    file_type: str = Form(...)
+):
+    """Queue a background document-processing task.
+
+    Returns a dict with the queued task's ID, or
+    ``{"success": False, "error": ...}`` when dispatching raises.
+    """
+    try:
+        queued = process_document_async.delay(user_id, document_id, file_path, file_type)
+        return dict(
+            success=True,
+            task_id=queued.id,
+            status="started",
+            message="문서 처리 작업이 시작되었습니다.",
+        )
+    except Exception as exc:
+        logger.error(f"❌ 문서 처리 작업 시작 실패: {exc}")
+        return dict(success=False, error=str(exc))
+
+@app.post("/tasks/ai/generate")
+async def start_ai_generation(
+    user_id: str = Form(...),
+    session_id: str = Form(...),
+    prompt: str = Form(...),
+    model_id: Optional[str] = Form(None)
+):
+    """Queue a background AI response-generation task.
+
+    Returns a dict with the queued task's ID, or
+    ``{"success": False, "error": ...}`` when dispatching raises.
+    """
+    try:
+        queued = generate_ai_response_async.delay(user_id, session_id, prompt, model_id)
+        return dict(
+            success=True,
+            task_id=queued.id,
+            status="started",
+            message="AI 응답 생성 작업이 시작되었습니다.",
+        )
+    except Exception as exc:
+        logger.error(f"❌ AI 응답 생성 작업 시작 실패: {exc}")
+        return dict(success=False, error=str(exc))
+
+@app.post("/tasks/rag/query")
+async def start_rag_query(
+    user_id: str = Form(...),
+    query: str = Form(...),
+    document_id: str = Form(...)
+):
+    """Queue a background RAG query task.
+
+    Returns a dict with the queued task's ID, or
+    ``{"success": False, "error": ...}`` when dispatching raises.
+    """
+    try:
+        queued = rag_query_async.delay(user_id, query, document_id)
+        return dict(
+            success=True,
+            task_id=queued.id,
+            status="started",
+            message="RAG 쿼리 작업이 시작되었습니다.",
+        )
+    except Exception as exc:
+        logger.error(f"❌ RAG 쿼리 작업 시작 실패: {exc}")
+        return dict(success=False, error=str(exc))
+
+@app.post("/tasks/documents/batch")
+async def start_batch_processing(
+    user_id: str = Form(...),
+    document_ids: str = Form(...)  # JSON-encoded array of document IDs
+):
+    """Queue a background task that processes several documents at once.
+
+    Args:
+        user_id: Owner of the documents.
+        document_ids: JSON string that must decode to an array of IDs.
+
+    Returns:
+        A dict with the queued task's ID, or ``{"success": False, "error": ...}``
+        when the payload is not a JSON array or dispatching raises.
+    """
+    try:
+        # Uses the module-level json import; the local re-import was redundant.
+        doc_ids = json.loads(document_ids)
+
+        # Validate before dispatching: otherwise a non-array payload would be
+        # queued anyway, and a JSON number would raise in len() only after
+        # the task had already been sent to the worker.
+        if not isinstance(doc_ids, list):
+            return {
+                "success": False,
+                "error": "document_ids는 JSON 배열이어야 합니다."
+            }
+
+        task = batch_process_documents_async.delay(user_id, doc_ids)
+
+        return {
+            "success": True,
+            "task_id": task.id,
+            "status": "started",
+            "message": f"문서 일괄 처리 작업이 시작되었습니다. ({len(doc_ids)}개 문서)"
+        }
+    except Exception as e:
+        logger.error(f"❌ 문서 일괄 처리 작업 시작 실패: {e}")
+        return {
+            "success": False,
+            "error": str(e)
+        }
+
+@app.get("/tasks/{task_id}")
+async def get_task_status_endpoint(task_id: str):
+    """Return the status, result and info of a background task.
+
+    Responds with ``success=False`` when ``get_task_status`` yields nothing
+    for the ID or when the lookup raises.
+    """
+    try:
+        task_state = get_task_status(task_id)
+        if not task_state:
+            return {"success": False, "error": "작업을 찾을 수 없습니다."}
+
+        payload = {"success": True, "task_id": task_id}
+        # Copy the three fields exposed to the client, in response order.
+        for key in ("status", "result", "info"):
+            payload[key] = task_state[key]
+        return payload
+    except Exception as exc:
+        logger.error(f"❌ 작업 상태 조회 실패: {exc}")
+        return {"success": False, "error": str(exc)}
+
+@app.delete("/tasks/{task_id}")
+async def cancel_task_endpoint(task_id: str):
+    """Cancel a background task.
+
+    Responds with ``success=False`` when ``cancel_task`` reports failure or
+    raises.
+    """
+    try:
+        if not cancel_task(task_id):
+            return {"success": False, "error": "작업 취소에 실패했습니다."}
+        return {"success": True, "task_id": task_id, "message": "작업이 취소되었습니다."}
+    except Exception as exc:
+        logger.error(f"❌ 작업 취소 실패: {exc}")
+        return {"success": False, "error": str(exc)}
+
+# 성능 모니터링 엔드포인트
+@app.post("/monitoring/start")
+async def start_performance_monitoring():
+    """Start the performance monitor.
+
+    Raises:
+        HTTPException: 500 when the monitor cannot be started.
+    """
+    try:
+        performance_monitor.start_monitoring()
+    except Exception as exc:
+        logger.error(f"모니터링 시작 실패: {exc}")
+        raise HTTPException(status_code=500, detail=f"모니터링 시작 실패: {str(exc)}")
+    return {"message": "성능 모니터링이 시작되었습니다."}
+
+@app.post("/monitoring/stop")
+async def stop_performance_monitoring():
+    """Stop the performance monitor.
+
+    Raises:
+        HTTPException: 500 when the monitor cannot be stopped.
+    """
+    try:
+        performance_monitor.stop_monitoring()
+    except Exception as exc:
+        logger.error(f"모니터링 중지 실패: {exc}")
+        raise HTTPException(status_code=500, detail=f"모니터링 중지 실패: {str(exc)}")
+    return {"message": "성능 모니터링이 중지되었습니다."}
+
+@app.get("/monitoring/status")
+async def get_monitoring_status():
+    """Return the performance monitor's summary.
+
+    Raises:
+        HTTPException: 500 when the summary cannot be produced.
+    """
+    try:
+        return performance_monitor.get_performance_summary()
+    except Exception as exc:
+        logger.error(f"모니터링 상태 조회 실패: {exc}")
+        raise HTTPException(status_code=500, detail=f"모니터링 상태 조회 실패: {str(exc)}")
+
+@app.get("/monitoring/health")
+async def get_system_health():
+    """Return the system health report as a plain dict.
+
+    Raises:
+        HTTPException: 500 when the health report cannot be read.
+    """
+    reported_fields = (
+        "status",
+        "cpu_health",
+        "memory_health",
+        "disk_health",
+        "network_health",
+        "recommendations",
+    )
+    try:
+        report = performance_monitor.get_system_health()
+        # Attribute reads stay inside the try so a missing field is reported
+        # as a 500 like any other failure.
+        return {name: getattr(report, name) for name in reported_fields}
+    except Exception as exc:
+        logger.error(f"시스템 건강 상태 조회 실패: {exc}")
+        raise HTTPException(status_code=500, detail=f"시스템 건강 상태 조회 실패: {str(exc)}")
+
+@app.post("/monitoring/export")
+async def export_performance_metrics(file_path: str = "performance_metrics.json"):
+    """Export the collected performance metrics to ``file_path``.
+
+    Raises:
+        HTTPException: 500 when the export fails.
+    """
+    try:
+        performance_monitor.export_metrics(file_path)
+    except Exception as exc:
+        logger.error(f"메트릭 내보내기 실패: {exc}")
+        raise HTTPException(status_code=500, detail=f"메트릭 내보내기 실패: {str(exc)}")
+    return {"message": f"성능 메트릭이 {file_path}에 저장되었습니다."}
+
+# ============================================================================
+# 이미지 OCR 전용 API 엔드포인트 (기존 텍스트 기반 시스템과 완전히 분리)
+# ============================================================================
+
+@app.post("/image-ocr/upload", response_model=DocumentUploadResponse)
+async def upload_image_document(
+    file: UploadFile = File(...),
+    user_id: str = Form("default_user"),
+    document_id: Optional[str] = Form(None)
+):
+    """Upload an image document, run image OCR on it and store the result.
+
+    The upload is written to a temporary file in the working directory,
+    handed to ``image_rag_processor.process_and_store_image_document`` and
+    removed again whether or not processing succeeds.
+
+    Args:
+        file: The uploaded image.
+        user_id: Owner of the document; defaults to ``"default_user"``.
+        document_id: Optional ID; the first 8 characters of a random UUID are
+            used when omitted.
+
+    Returns:
+        A ``DocumentUploadResponse`` built from the processor result, or a
+        failure response carrying the error message if anything raises.
+    """
+    start_time = time.time()
+    temp_file_path = None
+
+    try:
+        # Generate a document ID when none was supplied
+        if not document_id:
+            import uuid
+            document_id = str(uuid.uuid4())[:8]
+
+        # Both the filename and the document ID come from the client. Keep
+        # only the final path component of each so neither can point the
+        # temporary file outside the working directory ("../../x").
+        safe_name = os.path.basename(str(file.filename or "upload").replace("\\", "/"))
+        safe_id = os.path.basename(str(document_id).replace("\\", "/"))
+        temp_file_path = f"./temp_image_{safe_id}_{safe_name}"
+
+        content = await file.read()
+        with open(temp_file_path, "wb") as f:
+            f.write(content)
+
+        # OCR the image and store the result in the vector store
+        result = image_rag_processor.process_and_store_image_document(
+            user_id, document_id, temp_file_path
+        )
+
+        processing_time = time.time() - start_time
+        logger.info(f"🖼️ 이미지 OCR 문서 업로드 완료 ({processing_time:.2f}초): {file.filename}")
+
+        return DocumentUploadResponse(
+            success=result["success"],
+            document_id=document_id,
+            message=result.get("message", ""),
+            chunks=result.get("chunks"),
+            latex_count=result.get("latex_count"),
+            error=result.get("error"),
+            auto_response=result.get("auto_response", "")
+        )
+
+    except Exception as e:
+        logger.error(f"❌ 이미지 OCR 문서 업로드 실패: {e}")
+        return DocumentUploadResponse(
+            success=False,
+            document_id=document_id or "unknown",
+            message="이미지 OCR 문서 업로드 중 오류가 발생했습니다.",
+            error=str(e)
+        )
+    finally:
+        # Remove the temporary file on every exit path. A cleanup failure is
+        # only logged so it cannot turn a successful upload into an error.
+        if temp_file_path and os.path.exists(temp_file_path):
+            try:
+                os.remove(temp_file_path)
+            except OSError as cleanup_error:
+                logger.warning(f"⚠️ 임시 파일 삭제 실패: {cleanup_error}")
+
+@app.post("/image-ocr/generate", response_model=RAGResponse)
+async def generate_image_ocr_response(
+    query: str = Form(...),
+    user_id: str = Form("default_user"),
+    document_id: str = Form(...)
+):
+    """Answer ``query`` against an image-OCR document.
+
+    Returns the processor's result with ``processing_time`` added, or a
+    failed ``RAGResponse`` describing the error when generation raises.
+    """
+    started_at = time.time()
+
+    try:
+        answer = image_rag_processor.generate_image_rag_response(
+            user_id, document_id, query
+        )
+        elapsed = time.time() - started_at
+        # Attach the elapsed time to the processor's result before returning it.
+        answer["processing_time"] = elapsed
+        logger.info(f"🖼️ 이미지 OCR RAG 응답 생성 완료 ({elapsed:.2f}초)")
+        return answer
+    except Exception as exc:
+        logger.error(f"❌ 이미지 OCR RAG 응답 생성 실패: {exc}")
+        return RAGResponse(
+            success=False,
+            response=f"이미지 OCR RAG 응답 생성 중 오류가 발생했습니다: {str(exc)}",
+            context="",
+            sources=[],
+            search_results=0,
+            processing_time=time.time() - started_at
+        )
+
+@app.get("/image-ocr/document/{user_id}/{document_id}")
+async def get_image_document_info(user_id: str, document_id: str):
+    """Return the image-OCR processor's info for one document.
+
+    Any exception is logged and reported as
+    ``{"success": False, "error": ...}``.
+    """
+    try:
+        return image_rag_processor.get_image_document_info(user_id, document_id)
+    except Exception as exc:
+        logger.error(f"❌ 이미지 OCR 문서 정보 조회 실패: {exc}")
+        return dict(success=False, error=str(exc))
+
+@app.delete("/image-ocr/document/{user_id}/{document_id}")
+async def delete_image_document(user_id: str, document_id: str):
+    """Delete an image-OCR document from the vector store.
+
+    Responds with ``success=False`` when the store reports failure or raises.
+    """
+    try:
+        # NOTE(review): relies on a module-level `vector_store_manager`; its
+        # import is not visible in this part of the file. Confirm it exists.
+        if not vector_store_manager.delete_document(user_id, document_id):
+            return {"success": False, "error": "이미지 OCR 문서 삭제에 실패했습니다."}
+        return {"success": True, "message": "이미지 OCR 문서가 삭제되었습니다."}
+    except Exception as exc:
+        logger.error(f"❌ 이미지 OCR 문서 삭제 실패: {exc}")
+        return {"success": False, "error": str(exc)}
+
+# ============================================================================
+# LaTeX-OCR 전용 API 엔드포인트 (수학 수식 인식 기능 포함)
+# ============================================================================
+
+@app.post("/latex-ocr/upload", response_model=DocumentUploadResponse)
+async def upload_latex_document(
+    file: UploadFile = File(...),
+    user_id: str = Form("default_user"),
+    document_id: Optional[str] = Form(None)
+):
+    """Upload a document, run LaTeX-OCR on it and store the result.
+
+    The upload is written to a temporary file in the working directory,
+    handed to ``latex_rag_processor.process_and_store_latex_document`` and
+    removed again whether or not processing succeeds.
+
+    Args:
+        file: The uploaded document.
+        user_id: Owner of the document; defaults to ``"default_user"``.
+        document_id: Optional ID; the first 8 characters of a random UUID are
+            used when omitted.
+
+    Returns:
+        A ``DocumentUploadResponse`` built from the processor result, or a
+        failure response carrying the error message if anything raises.
+    """
+    start_time = time.time()
+    temp_file_path = None
+
+    try:
+        # Generate a document ID when none was supplied
+        if not document_id:
+            import uuid
+            document_id = str(uuid.uuid4())[:8]
+
+        # Both the filename and the document ID come from the client. Keep
+        # only the final path component of each so neither can point the
+        # temporary file outside the working directory ("../../x").
+        safe_name = os.path.basename(str(file.filename or "upload").replace("\\", "/"))
+        safe_id = os.path.basename(str(document_id).replace("\\", "/"))
+        temp_file_path = f"./temp_latex_{safe_id}_{safe_name}"
+
+        content = await file.read()
+        with open(temp_file_path, "wb") as f:
+            f.write(content)
+
+        # Run LaTeX-OCR and store the result in the vector store
+        result = latex_rag_processor.process_and_store_latex_document(
+            user_id, document_id, temp_file_path
+        )
+
+        processing_time = time.time() - start_time
+        logger.info(f"🧮 LaTeX-OCR 문서 업로드 완료 ({processing_time:.2f}초): {file.filename}")
+
+        return DocumentUploadResponse(
+            success=result["success"],
+            document_id=document_id,
+            message=result.get("message", ""),
+            chunks=result.get("chunks"),
+            latex_count=result.get("latex_count"),
+            error=result.get("error"),
+            auto_response=result.get("auto_response", "")
+        )
+
+    except Exception as e:
+        logger.error(f"❌ LaTeX-OCR 문서 업로드 실패: {e}")
+        return DocumentUploadResponse(
+            success=False,
+            document_id=document_id or "unknown",
+            message="LaTeX-OCR 문서 업로드 중 오류가 발생했습니다.",
+            error=str(e)
+        )
+    finally:
+        # Remove the temporary file on every exit path. A cleanup failure is
+        # only logged so it cannot turn a successful upload into an error.
+        if temp_file_path and os.path.exists(temp_file_path):
+            try:
+                os.remove(temp_file_path)
+            except OSError as cleanup_error:
+                logger.warning(f"⚠️ 임시 파일 삭제 실패: {cleanup_error}")
+
+@app.post("/latex-ocr/generate", response_model=RAGResponse)
+async def generate_latex_ocr_response(
+    query: str = Form(...),
+    user_id: str = Form("default_user"),
+    document_id: str = Form(...)
+):
+    """Answer ``query`` against a LaTeX-OCR document.
+
+    Returns the processor's result with ``processing_time`` added, or a
+    failed ``RAGResponse`` describing the error when generation raises.
+    """
+    started_at = time.time()
+
+    try:
+        answer = latex_rag_processor.generate_latex_rag_response(
+            user_id, document_id, query
+        )
+        elapsed = time.time() - started_at
+        # Attach the elapsed time to the processor's result before returning it.
+        answer["processing_time"] = elapsed
+        logger.info(f"🧮 LaTeX-OCR RAG 응답 생성 완료 ({elapsed:.2f}초)")
+        return answer
+    except Exception as exc:
+        logger.error(f"❌ LaTeX-OCR RAG 응답 생성 실패: {exc}")
+        return RAGResponse(
+            success=False,
+            response=f"LaTeX-OCR RAG 응답 생성 중 오류가 발생했습니다: {str(exc)}",
+            context="",
+            sources=[],
+            search_results=0,
+            processing_time=time.time() - started_at
+        )
+
+@app.get("/latex-ocr/document/{user_id}/{document_id}")
+async def get_latex_document_info(user_id: str, document_id: str):
+    """Return the LaTeX-OCR processor's info for one document.
+
+    Any exception is logged and reported as
+    ``{"success": False, "error": ...}``.
+    """
+    try:
+        return latex_rag_processor.get_latex_document_info(user_id, document_id)
+    except Exception as exc:
+        logger.error(f"❌ LaTeX-OCR 문서 정보 조회 실패: {exc}")
+        return dict(success=False, error=str(exc))
+
+@app.delete("/latex-ocr/document/{user_id}/{document_id}")
+async def delete_latex_document(user_id: str, document_id: str):
+    """Delete a LaTeX-OCR document from the vector store.
+
+    Responds with ``success=False`` when the store reports failure or raises.
+    """
+    try:
+        # NOTE(review): relies on a module-level `vector_store_manager`; its
+        # import is not visible in this part of the file. Confirm it exists.
+        if not vector_store_manager.delete_document(user_id, document_id):
+            return {"success": False, "error": "LaTeX-OCR 문서 삭제에 실패했습니다."}
+        return {"success": True, "message": "LaTeX-OCR 문서가 삭제되었습니다."}
+    except Exception as exc:
+        logger.error(f"❌ LaTeX-OCR 문서 삭제 실패: {exc}")
+        return {"success": False, "error": str(exc)}
+
+# ============================================================================
+# LaTeX-OCR + FAISS 통합 시스템 엔드포인트
+# ============================================================================
+
+# # LaTeX-OCR + FAISS 시스템 초기화
+# latex_ocr_faiss_simple = None
+# latex_ocr_faiss_integrated = None
+
+# def init_latex_ocr_faiss_systems():
+#     """LaTeX-OCR + FAISS 시스템 초기화"""
+#     global latex_ocr_faiss_simple, latex_ocr_faiss_integrated
+#     try:
+#         latex_ocr_faiss_simple = LatexOCRFAISSSimple()
+#         latex_ocr_faiss_integrated = LatexOCRFAISSIntegrated()
+#         logger.info("✅ LaTeX-OCR + FAISS 시스템 초기화 완료")
+#     except Exception as e:
+#         logger.error(f"❌ LaTeX-OCR + FAISS 시스템 초기화 실패: {e}")
+
+@app.post("/latex-ocr-faiss/process", response_model=DocumentUploadResponse)
+async def process_pdf_with_latex_faiss(
+    file: UploadFile = File(...),
+    user_id: str = Form("default_user"),
+    system_type: str = Form("simple")  # "simple" 또는 "integrated"
+):
+    """PDF에서 LaTeX 수식 추출 및 FAISS 저장"""
+    try:
+        # 파일 저장
+        upload_dir = Path("uploads/latex_ocr_faiss")
+        upload_dir.mkdir(parents=True, exist_ok=True)
+        
+        file_path = upload_dir / f"{user_id}_{file.filename}"
+        with open(file_path, "wb") as f:
+            content = await file.read()
+            f.write(content)
+        
+        # 시스템 선택 (주석 처리됨 - 삭제된 모듈)
+        # if system_type == "simple":
+        #     if not latex_ocr_faiss_simple:
+        #         init_latex_ocr_faiss_systems()
+        #     system = latex_ocr_faiss_simple
+        # else:
+        #     if not latex_ocr_faiss_integrated:
+        #         init_latex_ocr_faiss_systems()
+        #     system = latex_ocr_faiss_integrated
+        
+        # 임시 처리 - 기능 비활성화
+        return DocumentUploadResponse(
+            success=False,
+            document_id="",
+            message="LaTeX-OCR + FAISS 기능이 현재 비활성화되어 있습니다",
+            error="삭제된 모듈로 인해 기능이 비활성화됨"
+        )
+        
+        # PDF 처리 (주석 처리됨)
+        # result = system.process_pdf_with_latex(str(file_path), user_id)
+        
+        if result["success"]:
+            return DocumentUploadResponse(
+                success=True,
+                document_id=f"latex_ocr_faiss_{user_id}_{file.filename}",
+                message=f"LaTeX 수식 {result['latex_count']}개 추출 완료",
+                chunks=result['latex_count'],
+                latex_count=result['latex_count']
+            )
+        else:
+            return DocumentUploadResponse(
+                success=False,
+                document_id="",
+                message="LaTeX 수식 추출 실패",
+                error=result.get("error", "LaTeX 수식 추출 실패")
+            )
+            
+    except Exception as e:
+        logger.error(f"LaTeX-OCR + FAISS 처리 오류: {e}")
+        return DocumentUploadResponse(
+            success=False,
+            document_id="",
+            message="처리 중 오류가 발생했습니다",
+            error=f"처리 중 오류가 발생했습니다: {str(e)}"
+        )
+
+@app.post("/latex-ocr-faiss/search", response_model=RAGResponse)
+async def search_latex_formulas(
+    query: str = Form(...),
+    user_id: str = Form("default_user"),
+    document_path: Optional[str] = Form(None),
+    system_type: str = Form("simple"),
+    k: int = Form(5)
+):
+    """저장된 LaTeX 수식 검색"""
+    try:
+        # 시스템 선택 (주석 처리됨 - 삭제된 모듈)
+        # if system_type == "simple":
+        #     if not latex_ocr_faiss_simple:
+        #         init_latex_ocr_faiss_systems()
+        #     system = latex_ocr_faiss_simple
+        # else:
+        #     if not latex_ocr_faiss_integrated:
+        #         init_latex_ocr_faiss_systems()
+        #     system = latex_ocr_faiss_integrated
+        
+        # 임시 처리 - 기능 비활성화
+        return RAGResponse(
+            success=False,
+            response="LaTeX-OCR + FAISS 검색 기능이 현재 비활성화되어 있습니다",
+            context="",
+            sources=[],
+            search_results=0,
+            processing_time=0.0,
+            error="삭제된 모듈로 인해 기능이 비활성화됨"
+        )
+        
+        # 수식 검색 (주석 처리됨)
+        # search_result = system.search_formulas(query, user_id, document_path, k)
+        
+        if search_result["success"]:
+            # 검색 결과를 응답 형식으로 변환
+            context = "\n".join([f"수식: {result['formula']} (유사도: {result['similarity']:.3f})" 
+                                for result in search_result['results']])
+            
+            sources = [{"formula": result['formula'], "similarity": result['similarity'], 
+                       "page": result.get('page', 1)} for result in search_result['results']]
+            
+            return RAGResponse(
+                success=True,
+                response=f"검색된 수식 {search_result['search_results']}개를 찾았습니다.",
+                context=context,
+                sources=sources,
+                search_results=search_result['search_results'],
+                processing_time=0.0  # 실제 처리 시간 측정 필요
+            )
+        else:
+            return RAGResponse(
+                success=False,
+                response="수식 검색에 실패했습니다.",
+                context="",
+                sources=[],
+                search_results=0,
+                processing_time=0.0,
+                error=search_result.get("error", "검색 실패")
+            )
+            
+    except Exception as e:
+        logger.error(f"LaTeX 수식 검색 오류: {e}")
+        return RAGResponse(
+            success=False,
+            response="검색 중 오류가 발생했습니다.",
+            context="",
+            sources=[],
+            search_results=0,
+            processing_time=0.0,
+            error=str(e)
+        )
+
+@app.get("/latex-ocr-faiss/status")
+async def get_latex_ocr_faiss_status():
+    """LaTeX-OCR + FAISS 시스템 상태 확인"""
+    try:
+        # 주석 처리됨 - 삭제된 모듈
+        # simple_status = latex_ocr_faiss_simple is not None
+        # integrated_status = latex_ocr_faiss_integrated is not None
+        # return {
+        #     "simple_system_initialized": simple_status,
+        #     "integrated_system_initialized": integrated_status,
+        #     "status": "ready" if (simple_status or integrated_status) else "not_initialized"
+        # }
+        
+        return {
+            "simple_system_initialized": False,
+            "integrated_system_initialized": False,
+            "status": "disabled",
+            "message": "LaTeX-OCR + FAISS 기능이 현재 비활성화되어 있습니다"
+        }
+    except Exception as e:
+        logger.error(f"상태 확인 오류: {e}")
+        return {"status": "error", "error": str(e)}
+
+# ============================================================================
+# 컨텍스트 관리 시스템 엔드포인트
+# ============================================================================
+
+@app.post("/context/set-system-prompt")
+async def set_system_prompt(prompt: str = Form(...)):
+    """시스템 프롬프트 설정"""
+    try:
+        context_manager.set_system_prompt(prompt)
+        return {
+            "success": True,
+            "message": "시스템 프롬프트가 설정되었습니다.",
+            "prompt_length": len(prompt)
+        }
+    except Exception as e:
+        logger.error(f"❌ 시스템 프롬프트 설정 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/context/add-message")
+async def add_context_message(
+    role: str = Form(...),  # 'user' 또는 'assistant'
+    content: str = Form(...),
+    message_id: str = Form(None),
+    metadata: str = Form("{}")  # JSON 문자열
+):
+    """컨텍스트에 메시지 추가"""
+    try:
+        import json
+        metadata_dict = json.loads(metadata) if metadata else {}
+        
+        if role == "user":
+            msg_id = context_manager.add_user_message(content, message_id, metadata_dict)
+        elif role == "assistant":
+            msg_id = context_manager.add_assistant_message(content, message_id, metadata_dict)
+        else:
+            return {"success": False, "error": "잘못된 역할입니다. 'user' 또는 'assistant'를 사용하세요."}
+        
+        return {
+            "success": True,
+            "message": "메시지가 컨텍스트에 추가되었습니다.",
+            "message_id": msg_id,
+            "context_summary": context_manager.get_context_summary()
+        }
+    except Exception as e:
+        logger.error(f"❌ 컨텍스트 메시지 추가 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.get("/context/get")
+async def get_context(
+    include_system: bool = True,
+    max_length: Optional[int] = None,
+    recent_turns: Optional[int] = None
+):
+    """현재 컨텍스트 조회"""
+    try:
+        if recent_turns:
+            context = context_manager.get_recent_context(recent_turns)
+        else:
+            context = context_manager.get_context(include_system, max_length)
+        
+        return {
+            "success": True,
+            "context": context,
+            "context_summary": context_manager.get_context_summary(),
+            "memory_efficiency": context_manager.get_memory_efficiency()
+        }
+    except Exception as e:
+        logger.error(f"❌ 컨텍스트 조회 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.get("/context/summary")
+async def get_context_summary():
+    """컨텍스트 요약 정보 조회"""
+    try:
+        return {
+            "success": True,
+            "summary": context_manager.get_context_summary(),
+            "memory_efficiency": context_manager.get_memory_efficiency()
+        }
+    except Exception as e:
+        logger.error(f"❌ 컨텍스트 요약 조회 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/context/clear")
+async def clear_context():
+    """컨텍스트 초기화"""
+    try:
+        context_manager.clear_context()
+        return {
+            "success": True,
+            "message": "컨텍스트가 초기화되었습니다."
+        }
+    except Exception as e:
+        logger.error(f"❌ 컨텍스트 초기화 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.delete("/context/message/{message_id}")
+async def remove_context_message(message_id: str):
+    """컨텍스트에서 특정 메시지 제거"""
+    try:
+        success = context_manager.remove_message(message_id)
+        if success:
+            return {
+                "success": True,
+                "message": "메시지가 제거되었습니다.",
+                "context_summary": context_manager.get_context_summary()
+            }
+        else:
+            return {"success": False, "error": "메시지를 찾을 수 없습니다."}
+    except Exception as e:
+        logger.error(f"❌ 메시지 제거 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.put("/context/message/{message_id}")
+async def edit_context_message(
+    message_id: str,
+    new_content: str = Form(...)
+):
+    """컨텍스트 메시지 수정"""
+    try:
+        success = context_manager.edit_message(message_id, new_content)
+        if success:
+            return {
+                "success": True,
+                "message": "메시지가 수정되었습니다.",
+                "context_summary": context_manager.get_context_summary()
+            }
+        else:
+            return {"success": False, "error": "메시지를 찾을 수 없습니다."}
+    except Exception as e:
+        logger.error(f"❌ 메시지 수정 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.get("/context/search")
+async def search_context(query: str, max_results: int = 5):
+    """컨텍스트 내에서 검색"""
+    try:
+        results = context_manager.search_context(query, max_results)
+        return {
+            "success": True,
+            "query": query,
+            "results": results,
+            "total_results": len(results)
+        }
+    except Exception as e:
+        logger.error(f"❌ 컨텍스트 검색 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/context/export")
+async def export_context(file_path: str = Form(None)):
+    """컨텍스트 내보내기"""
+    try:
+        exported_path = context_manager.export_context(file_path)
+        return {
+            "success": True,
+            "message": "컨텍스트가 내보내졌습니다.",
+            "file_path": exported_path
+        }
+    except Exception as e:
+        logger.error(f"❌ 컨텍스트 내보내기 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/context/import")
+async def import_context(file_path: str = Form(...)):
+    """컨텍스트 가져오기"""
+    try:
+        success = context_manager.import_context(file_path)
+        if success:
+            return {
+                "success": True,
+                "message": "컨텍스트가 가져와졌습니다.",
+                "context_summary": context_manager.get_context_summary()
+            }
+        else:
+            return {"success": False, "error": "컨텍스트 가져오기에 실패했습니다."}
+    except Exception as e:
+        logger.error(f"❌ 컨텍스트 가져오기 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+# ============================================================================
+# LoRA/QLoRA 관리 시스템 엔드포인트
+# ============================================================================
+
+@app.post("/lora/load-base-model")
+async def load_lora_base_model(
+    model_path: str = Form(...),
+    model_type: str = Form("causal_lm")
+):
+    """LoRA 기본 모델 로드"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        success = lora_manager.load_base_model(model_path, model_type)
+        if success:
+            return {
+                "success": True,
+                "message": "기본 모델이 로드되었습니다.",
+                "model_path": model_path,
+                "device": lora_manager.device
+            }
+        else:
+            return {"success": False, "error": "모델 로드에 실패했습니다."}
+    except Exception as e:
+        logger.error(f"❌ LoRA 기본 모델 로드 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/lora/create-config")
+async def create_lora_config(
+    r: int = Form(16),
+    lora_alpha: int = Form(32),
+    target_modules: str = Form("q_proj,v_proj,k_proj,o_proj,gate_proj,up_proj,down_proj"),
+    lora_dropout: float = Form(0.1),
+    bias: str = Form("none"),
+    task_type: str = Form("CAUSAL_LM")
+):
+    """LoRA 설정 생성"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        # target_modules를 리스트로 변환
+        target_modules_list = target_modules.split(",") if target_modules else None
+        
+        config = lora_manager.create_lora_config(
+            r=r,
+            lora_alpha=lora_alpha,
+            target_modules=target_modules_list,
+            lora_dropout=lora_dropout,
+            bias=bias,
+            task_type=task_type
+        )
+        
+        return {
+            "success": True,
+            "message": "LoRA 설정이 생성되었습니다.",
+            "config": config.to_dict()
+        }
+    except Exception as e:
+        logger.error(f"❌ LoRA 설정 생성 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/lora/apply")
+async def apply_lora_adapter(adapter_name: str = Form("default")):
+    """LoRA 어댑터를 모델에 적용"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        success = lora_manager.apply_lora_to_model(adapter_name)
+        if success:
+            return {
+                "success": True,
+                "message": "LoRA 어댑터가 적용되었습니다.",
+                "adapter_name": adapter_name,
+                "stats": lora_manager.get_adapter_stats()
+            }
+        else:
+            return {"success": False, "error": "LoRA 어댑터 적용에 실패했습니다."}
+    except Exception as e:
+        logger.error(f"❌ LoRA 어댑터 적용 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/lora/load-adapter")
+async def load_lora_adapter(
+    adapter_path: str = Form(...),
+    adapter_name: str = Form(None)
+):
+    """저장된 LoRA 어댑터 로드"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        success = lora_manager.load_lora_adapter(adapter_path, adapter_name)
+        if success:
+            return {
+                "success": True,
+                "message": "LoRA 어댑터가 로드되었습니다.",
+                "adapter_name": lora_manager.current_adapter_name,
+                "stats": lora_manager.get_adapter_stats()
+            }
+        else:
+            return {"success": False, "error": "LoRA 어댑터 로드에 실패했습니다."}
+    except Exception as e:
+        logger.error(f"❌ LoRA 어댑터 로드 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/lora/save-adapter")
+async def save_lora_adapter(
+    adapter_name: str = Form(None),
+    output_dir: str = Form(None)
+):
+    """LoRA 어댑터 저장"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        success = lora_manager.save_lora_adapter(adapter_name, output_dir)
+        if success:
+            return {
+                "success": True,
+                "message": "LoRA 어댑터가 저장되었습니다.",
+                "adapter_name": lora_manager.current_adapter_name
+            }
+        else:
+            return {"success": False, "error": "LoRA 어댑터 저장에 실패했습니다."}
+    except Exception as e:
+        logger.error(f"❌ LoRA 어댑터 저장 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.get("/lora/adapters")
+async def list_lora_adapters():
+    """사용 가능한 LoRA 어댑터 목록"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        adapters = lora_manager.list_available_adapters()
+        return {
+            "success": True,
+            "adapters": adapters
+        }
+    except Exception as e:
+        logger.error(f"❌ LoRA 어댑터 목록 조회 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.get("/lora/stats")
+async def get_lora_stats():
+    """현재 LoRA 어댑터 통계"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        stats = lora_manager.get_adapter_stats()
+        return {
+            "success": True,
+            "stats": stats
+        }
+    except Exception as e:
+        logger.error(f"❌ LoRA 통계 조회 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/lora/switch")
+async def switch_lora_adapter(adapter_name: str = Form(...)):
+    """LoRA 어댑터 전환"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        success = lora_manager.switch_adapter(adapter_name)
+        if success:
+            return {
+                "success": True,
+                "message": f"LoRA 어댑터가 {adapter_name}으로 전환되었습니다.",
+                "adapter_name": adapter_name,
+                "stats": lora_manager.get_adapter_stats()
+            }
+        else:
+            return {"success": False, "error": "LoRA 어댑터 전환에 실패했습니다."}
+    except Exception as e:
+        logger.error(f"❌ LoRA 어댑터 전환 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/lora/unload")
+async def unload_lora_adapter():
+    """LoRA 어댑터 언로드"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        success = lora_manager.unload_adapter()
+        if success:
+            return {
+                "success": True,
+                "message": "LoRA 어댑터가 언로드되었습니다."
+            }
+        else:
+            return {"success": False, "error": "LoRA 어댑터 언로드에 실패했습니다."}
+    except Exception as e:
+        logger.error(f"❌ LoRA 어댑터 언로드 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/lora/generate")
+async def generate_with_lora(
+    prompt: str = Form(...),
+    max_length: int = Form(100),
+    temperature: float = Form(0.7)
+):
+    """LoRA 모델을 사용한 텍스트 생성"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        response = lora_manager.generate_text(prompt, max_length, temperature)
+        return {
+            "success": True,
+            "response": response,
+            "adapter_name": lora_manager.current_adapter_name
+        }
+    except Exception as e:
+        logger.error(f"❌ LoRA 텍스트 생성 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.post("/lora/merge")
+async def merge_lora_with_base(output_path: str = Form(None)):
+    """LoRA 어댑터를 기본 모델과 병합"""
+    if not LORA_AVAILABLE or lora_manager is None:
+        return {
+            "success": False, 
+            "error": "LoRA 기능이 사용 불가능합니다. PEFT 라이브러리가 설치되지 않았습니다."
+        }
+    
+    try:
+        success = lora_manager.merge_lora_with_base(output_path)
+        if success:
+            return {
+                "success": True,
+                "message": "LoRA 어댑터가 기본 모델과 병합되었습니다.",
+                "output_path": output_path or f"{lora_manager.base_model_path}_merged"
+            }
+        else:
+            return {"success": False, "error": "LoRA 어댑터 병합에 실패했습니다."}
+    except Exception as e:
+        logger.error(f"❌ LoRA 어댑터 병합 실패: {e}")
+        return {"success": False, "error": str(e)}
+
+# ============================================================================
+# 멀티모달 RAG 시스템 엔드포인트
+# ============================================================================
+
+@app.post("/hybrid-rag/upload", response_model=DocumentUploadResponse)
+async def upload_hybrid_document(
+    file: UploadFile = File(...),
+    user_id: str = Form("default_user"),
+    document_id: Optional[str] = Form(None)
+):
+    """멀티모달 RAG 문서 업로드"""
+    try:
+        # 파일 저장
+        upload_dir = Path("uploads/hybrid_rag")
+        upload_dir.mkdir(parents=True, exist_ok=True)
+        
+        if not document_id:
+            document_id = f"{user_id}_{int(time.time())}_{file.filename}"
+        
+        file_path = upload_dir / document_id
+        with open(file_path, "wb") as buffer:
+            content = await file.read()
+            buffer.write(content)
+        
+        # 멀티모달 처리
+        result = hybrid_rag_processor.process_document_hybrid(str(file_path), user_id, document_id)
+        
+        if result["success"]:
+            # 성공한 시스템 수 계산
+            success_systems = []
+            for key, value in result.items():
+                if key.endswith('_processing') and value and value.get('success', False):
+                    system_name = key.replace('_processing', '').replace('_', ' ').title()
+                    success_systems.append(system_name)
+            
+            return DocumentUploadResponse(
+                success=True,
+                document_id=document_id,
+                message=f"멀티모달 처리 완료: {', '.join(success_systems)} 시스템에서 처리됨",
+                chunks=len(success_systems)
+            )
+        else:
+            return DocumentUploadResponse(
+                success=False,
+                error=result.get("error", "멀티모달 처리 실패")
+            )
+            
+    except Exception as e:
+        logger.error(f"멀티모달 RAG 문서 업로드 오류: {e}")
+        return DocumentUploadResponse(
+            success=False,
+            error=f"업로드 중 오류가 발생했습니다: {str(e)}"
+        )
+
+@app.post("/hybrid-rag/generate", response_model=RAGResponse)
+async def generate_hybrid_rag_response(
+    query: str = Form(...),
+    user_id: str = Form("default_user"),
+    document_id: str = Form(...),
+    use_text: bool = Form(True),
+    use_image: bool = Form(True),
+    use_latex: bool = Form(True),
+    use_latex_ocr: bool = Form(False),  # LaTeX-OCR 기능이 비활성화됨
+    max_length: Optional[int] = Form(None),
+    temperature: Optional[float] = Form(None),
+    top_p: Optional[float] = Form(None),
+    do_sample: Optional[bool] = Form(None)
+):
+    """멀티모달 RAG 응답 생성"""
+    try:
+        result = hybrid_rag_processor.generate_hybrid_response(
+            query, user_id, document_id,
+            use_text, use_image, use_latex, use_latex_ocr,
+            max_length, temperature, top_p, do_sample
+        )
+        
+        return RAGResponse(
+            success=result["success"],
+            response=result["response"],
+            context=result["context"],
+            sources=result["sources"],
+            search_results=result["search_results"],
+            processing_time=result["processing_time"]
+        )
+        
+    except Exception as e:
+        logger.error(f"멀티모달 RAG 응답 생성 오류: {e}")
+        return RAGResponse(
+            success=False,
+            response=f"멀티모달 RAG 응답 생성 중 오류가 발생했습니다: {str(e)}",
+            context="",
+            sources=[],
+            search_results=0,
+            processing_time=0.0
+        )
+
+@app.get("/hybrid-rag/document/{user_id}/{document_id}")
+async def get_hybrid_document_info(user_id: str, document_id: str):
+    """멀티모달 RAG 문서 정보 조회"""
+    try:
+        result = hybrid_rag_processor.get_document_info(user_id, document_id)
+        return result
+    except Exception as e:
+        logger.error(f"멀티모달 RAG 문서 정보 조회 오류: {e}")
+        return {"success": False, "error": str(e)}
+
+@app.get("/hybrid-rag/status")
+async def get_hybrid_rag_status():
+    """멀티모달 RAG 시스템 상태 확인"""
+    try:
+        return {
+            "text_rag_available": True,
+            "image_rag_available": True,
+            "latex_rag_available": True,
+            "latex_ocr_faiss_available": False,  # LaTeX-OCR 기능이 비활성화됨
+            "status": "ready"
+        }
+    except Exception as e:
+        logger.error(f"멀티모달 RAG 상태 확인 오류: {e}")
+        return {"status": "error", "error": str(e)}
+
+# ============================================================================
+# 🔄 RAG 시스템과 고급 컨텍스트 관리자 통합 API
+# ============================================================================
+
+@app.post("/rag/context-integrated/query")
+async def rag_query_with_context_integration(
+    user_id: str = Form(...),
+    document_id: str = Form(...),
+    query: str = Form(...),
+    session_id: str = Form(...),
+    max_results: int = Form(5),
+    enable_context_integration: bool = Form(True)
+):
+    """RAG 쿼리 + 컨텍스트 통합 - 고급 컨텍스트 관리자와 연동"""
+    try:
+        logger.info(f"🔍 RAG + 컨텍스트 통합 쿼리 시작: 사용자 {user_id}, 문서 {document_id}, 세션 {session_id}")
+        
+        # 컨텍스트 관리자 확인
+        if not context_manager:
+            return {"status": "error", "message": "컨텍스트 관리자를 사용할 수 없습니다."}
+        
+        # RAG 응답 생성 (컨텍스트 통합 활성화)
+        rag_result = rag_processor.generate_rag_response(
+            user_id=user_id,
+            document_id=document_id,
+            query=query,
+            session_id=session_id if enable_context_integration else None,
+            context_manager=context_manager if enable_context_integration else None
+        )
+        
+        if not rag_result["success"]:
+            return rag_result
+        
+        # 컨텍스트에 RAG 결과 통합
+        if enable_context_integration:
+            try:
+                # RAG 검색 결과를 컨텍스��에 추가
+                rag_summary = f"RAG 검색 결과: {query}에 대한 {rag_result.get('search_results', 0)}개 관련 문서 발견"
+                
+                # 컨텍스트에 시스템 메시지로 추가
+                context_manager.add_system_message(
+                    rag_summary,
+                    metadata={"session_id": session_id, "type": "rag_integration", "query": query}
+                )
+                
+                logger.info(f"🔄 RAG 결과를 컨텍스트에 통합 완료 (세션: {session_id})")
+                
+            except Exception as e:
+                logger.warning(f"⚠️ 컨텍스트 통합 실패: {e}")
+        
+        # 통합된 결과 반환
+        result = {
+            "status": "success",
+            "rag_response": rag_result,
+            "context_integration": enable_context_integration,
+            "session_id": session_id,
+            "context_summary": context_manager.get_context_summary(session_id) if enable_context_integration else None
+        }
+        
+        logger.info(f"✅ RAG + 컨텍스트 통합 쿼리 완료")
+        return result
+        
+    except Exception as e:
+        logger.error(f"❌ RAG + 컨텍스트 통합 쿼리 실패: {e}")
+        return {"status": "error", "message": str(e)}
+
+@app.get("/rag/context-integrated/summary/{session_id}")
+async def get_rag_context_summary(session_id: str):
+    """RAG 통합 컨텍스트 요약 조회"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "컨텍스트 관리자를 사용할 수 없습니다."}
+        
+        # 컨텍스트 요약 정보
+        context_summary = context_manager.get_context_summary(session_id)
+        
+        # RAG 관련 정보 추출
+        rag_contexts = []
+        if session_id in context_manager.session_conversations:
+            for turn in context_manager.session_conversations[session_id]:
+                if (hasattr(turn, 'metadata') and turn.metadata and 
+                    turn.metadata.get('type') == 'rag_integration'):
+                    rag_contexts.append({
+                        "query": turn.metadata.get('query', ''),
+                        "content": turn.content,
+                        "timestamp": turn.timestamp
+                    })
+        
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "context_summary": context_summary,
+            "rag_contexts": rag_contexts,
+            "rag_context_count": len(rag_contexts)
+        }
+        
+    except Exception as e:
+        logger.error(f"❌ RAG 컨텍스트 요약 조회 실패: {e}")
+        return {"status": "error", "message": str(e)}
+
+@app.post("/rag/context-integrated/clear/{session_id}")
+async def clear_rag_context(session_id: str):
+    """RAG 통합 컨텍스트 정리"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "컨텍스트 관리자를 사용할 수 없습니다."}
+        
+        # RAG 관련 컨텍스트만 제거
+        if session_id in context_manager.session_conversations:
+            conversation_history = context_manager.session_conversations[session_id]
+            rag_turns = []
+            
+            for turn in conversation_history:
+                if (hasattr(turn, 'metadata') and turn.metadata and 
+                    turn.metadata.get('type') == 'rag_integration'):
+                    rag_turns.append(turn)
+            
+            # RAG 관련 턴 제거
+            for turn in rag_turns:
+                context_manager.remove_message(turn.message_id, session_id)
+            
+            logger.info(f"🗑️ RAG 컨텍스트 정리 완료: {len(rag_turns)}개 턴 제거 (세션: {session_id})")
+            
+            return {
+                "status": "success",
+                "session_id": session_id,
+                "removed_rag_turns": len(rag_turns),
+                "message": f"RAG 컨텍스트 {len(rag_turns)}개 턴이 제거되었습니다."
+            }
+        
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "removed_rag_turns": 0,
+            "message": "제거할 RAG 컨텍스트가 없습니다."
+        }
+        
+    except Exception as e:
+        logger.error(f"❌ RAG 컨텍스트 정리 실패: {e}")
+        return {"status": "error", "message": str(e)}
+
+@app.get("/rag/performance/stats")
+async def get_rag_performance_stats():
+    """RAG 시스템 성능 통계 조회"""
+    try:
+        # RAG 프로세서 성능 통계
+        rag_stats = rag_processor.get_performance_stats()
+        
+        # 벡터 스토어 성능 통계
+        vector_stats = vector_store_manager.get_performance_stats()
+        
+        # 통합 성능 통계
+        combined_stats = {
+            "rag_processor": rag_stats,
+            "vector_store": vector_stats,
+            "overall": {
+                "total_operations": rag_stats.get("total_requests", 0) + vector_stats.get("total_operations", 0),
+                "success_rate": (rag_stats.get("success_rate", 0.0) + vector_stats.get("success_rate", 0.0)) / 2,
+                "avg_processing_time": (rag_stats.get("avg_processing_time", 0.0) + vector_stats.get("avg_operation_time", 0.0)) / 2
+            },
+            "timestamp": time.time()
+        }
+        
+        return {
+            "status": "success",
+            "performance_stats": combined_stats
+        }
+        
+    except Exception as e:
+        logger.error(f"❌ RAG 성능 통계 조회 실패: {e}")
+        return {"status": "error", "message": str(e)}
+
+@app.post("/rag/performance/reset")
+async def reset_rag_performance_stats():
+    """RAG 시스템 성능 통계 초기화"""
+    try:
+        # RAG 프로세서 통계 초기화
+        rag_processor.reset_stats()
+        
+        # 벡터 스토어 통계 초기화
+        vector_store_manager.reset_stats()
+        
+        logger.info("🔄 RAG 시스템 성능 통계 초기화 완료")
+        
+        return {
+            "status": "success",
+            "message": "RAG 시스템 성능 통계가 초기화되었습니다."
+        }
+        
+    except Exception as e:
+        logger.error(f"❌ RAG 성능 통계 초기화 실패: {e}")
+        return {"status": "error", "message": str(e)}
+
+@app.get("/rag/health/check")
+async def rag_health_check():
+    """RAG 시스템 건강 상태 확인"""
+    try:
+        # RAG 프로세서 상태
+        rag_status = {
+            "rag_processor": "healthy",
+            "enable_context_integration": rag_processor.enable_context_integration,
+            "max_context_length": rag_processor.max_context_length,
+            "max_search_results": rag_processor.max_search_results
+        }
+        
+        # 벡터 스토어 상태
+        vector_status = vector_store_manager.health_check()
+        
+        # 문서 프로세서 상태
+        doc_processor_status = {
+            "status": "healthy",
+            "supported_formats": document_processor.supported_formats if hasattr(document_processor, 'supported_formats') else [],
+            "ocr_available": hasattr(document_processor, 'ocr_reader') and document_processor.ocr_reader is not None
+        }
+        
+        # 통합 상태
+        overall_status = "healthy"
+        if vector_status.get("status") != "healthy":
+            overall_status = "degraded"
+        
+        return {
+            "status": "success",
+            "overall_status": overall_status,
+            "rag_processor": rag_status,
+            "vector_store": vector_status,
+            "document_processor": doc_processor_status,
+            "timestamp": time.time()
+        }
+        
+    except Exception as e:
+        logger.error(f"❌ RAG 시스템 건강 상태 확인 실패: {e}")
+        return {
+            "status": "error",
+            "overall_status": "unhealthy",
+            "error": str(e),
+            "timestamp": time.time()
+        }
+
+@app.post("/rag/context-integrated/batch-process")
+async def batch_process_with_context_integration(
+    user_id: str = Form(...),
+    session_id: str = Form(...),
+    documents: List[UploadFile] = File(...),
+    enable_context_integration: bool = Form(True)
+):
+    """배치 문서 처리 + 컨텍스트 통합"""
+    try:
+        logger.info(f"📚 배치 문서 처리 + 컨텍스트 통합 시작: 사용자 {user_id}, 세션 {session_id}, 문서 {len(documents)}개")
+        
+        results = []
+        
+        for i, doc in enumerate(documents):
+            try:
+                # 임시 파일로 저장
+                temp_path = f"./temp_{user_id}_{session_id}_{i}_{int(time.time())}"
+                with open(temp_path, "wb") as f:
+                    f.write(doc.file.read())
+                
+                # 문서 ID 생성
+                document_id = f"batch_{session_id}_{i}_{int(time.time())}"
+                
+                # RAG 처리
+                rag_result = rag_processor.process_and_store_document(
+                    user_id=user_id,
+                    document_id=document_id,
+                    file_path=temp_path
+                )
+                
+                # 컨텍스트 통합
+                if enable_context_integration and rag_result["success"]:
+                    try:
+                        context_manager.add_system_message(
+                            f"배치 문서 처리 완료: {doc.filename} ({rag_result.get('chunks', 0)}개 청크)",
+                            metadata={"session_id": session_id, "type": "batch_rag", "filename": doc.filename}
+                        )
+                    except Exception as e:
+                        logger.warning(f"⚠️ 컨텍스트 통합 실패: {e}")
+                
+                # 임시 파일 정리
+                try:
+                    os.remove(temp_path)
+                except:
+                    pass
+                
+                results.append({
+                    "filename": doc.filename,
+                    "document_id": document_id,
+                    "rag_result": rag_result,
+                    "context_integration": enable_context_integration
+                })
+                
+            except Exception as e:
+                logger.error(f"❌ 문서 {doc.filename} 처리 실패: {e}")
+                results.append({
+                    "filename": doc.filename,
+                    "error": str(e),
+                    "context_integration": enable_context_integration
+                })
+        
+        # 성공/실패 통계
+        success_count = sum(1 for r in results if r.get("rag_result", {}).get("success", False))
+        error_count = len(results) - success_count
+        
+        logger.info(f"✅ 배치 문서 처리 완료: {success_count}개 성공, {error_count}개 실패")
+        
+        return {
+            "status": "success",
+            "user_id": user_id,
+            "session_id": session_id,
+            "total_documents": len(documents),
+            "success_count": success_count,
+            "error_count": error_count,
+            "results": results,
+            "context_integration": enable_context_integration
+        }
+        
+    except Exception as e:
+        logger.error(f"❌ 배치 문서 처리 + 컨텍스트 통합 실패: {e}")
+        return {"status": "error", "message": str(e)}
+
+@app.get("/rag/context-integrated/search-history/{session_id}")
+async def get_rag_search_history(session_id: str, limit: int = 10):
+    """RAG 검색 히스토리 조회"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "컨텍스트 관리자를 사용할 수 없습니다."}
+        
+        # RAG 관련 검색 히스토리 추출
+        search_history = []
+        if session_id in context_manager.session_conversations:
+            for turn in context_manager.session_conversations[session_id]:
+                if (hasattr(turn, 'metadata') and turn.metadata and 
+                    turn.metadata.get('type') in ['rag_integration', 'rag_context', 'batch_rag']):
+                    search_history.append({
+                        "timestamp": turn.timestamp,
+                        "type": turn.metadata.get('type'),
+                        "query": turn.metadata.get('query', ''),
+                        "filename": turn.metadata.get('filename', ''),
+                        "content": turn.content
+                    })
+        
+        # 최근 순으로 정렬하고 제한
+        search_history.sort(key=lambda x: x['timestamp'], reverse=True)
+        limited_history = search_history[:limit]
+        
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "search_history": limited_history,
+            "total_count": len(search_history),
+            "limited_count": len(limited_history)
+        }
+        
+    except Exception as e:
+        logger.error(f"❌ RAG 검색 히스토리 조회 실패: {e}")
+        return {"status": "error", "message": str(e)}
+
+# ============================================================================
+# 🔄 실무용 고급 컨텍스트 관리자 API 엔드포인트
+# ============================================================================
+
+@app.get("/context/advanced/summary-method")
+async def get_summary_method():
+    """현재 요약 방법 조회"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        return {
+            "status": "success",
+            "current_method": context_manager.current_summary_method,
+            "available_methods": list(context_manager.summary_models.keys())
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/context/advanced/summary-method")
+async def set_summary_method(method: str = Form(...)):
+    """요약 방법 설정"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        context_manager.set_summary_method(method)
+        
+        return {
+            "status": "success",
+            "message": f"요약 방법이 {method}로 변경되었습니다",
+            "current_method": context_manager.current_summary_method
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/advanced/summary-stats/{session_id}")
+async def get_advanced_summary_stats(session_id: str):
+    """고급 요약 통계 조회"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        summary_stats = context_manager.get_summary_stats(session_id)
+        
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "summary_stats": summary_stats
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/advanced/compressed/{session_id}")
+async def get_compressed_context(session_id: str, max_tokens: Optional[int] = None):
+    """압축된 컨텍스트 조회 (요약 포함)"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        compressed_context = context_manager.get_compressed_context(session_id, max_tokens)
+        estimated_tokens = context_manager._estimate_tokens(compressed_context)
+        
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "compressed_context": compressed_context,
+            "estimated_tokens": estimated_tokens,
+            "context_length": len(compressed_context)
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/context/advanced/force-compress/{session_id}")
+async def force_compression(session_id: str):
+    """강제 압축 실행"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        # 압축 전 통계
+        before_stats = context_manager.get_summary_stats(session_id)
+        
+        # 강제 압축 실행
+        context_manager.force_compression(session_id)
+        
+        # 압축 후 통계
+        after_stats = context_manager.get_summary_stats(session_id)
+        
+        return {
+            "status": "success",
+            "message": f"세션 {session_id} 강제 압축 완료",
+            "session_id": session_id,
+            "before_compression": before_stats,
+            "after_compression": after_stats,
+            "compression_effect": {
+                "summary_reduction": before_stats.get("total_summaries", 0) - after_stats.get("total_summaries", 0),
+                "token_reduction": before_stats.get("total_tokens", 0) - after_stats.get("total_tokens", 0)
+            }
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/advanced/turn-summaries/{session_id}")
+async def get_turn_summaries(session_id: str, limit: int = 10):
+    """턴 요약 목록 조회"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        if session_id not in context_manager.turn_summaries:
+            return {
+                "status": "success",
+                "session_id": session_id,
+                "turn_summaries": [],
+                "total_count": 0
+            }
+        
+        summaries = context_manager.turn_summaries[session_id]
+        limited_summaries = summaries[-limit:] if limit > 0 else summaries
+        
+        # TurnSummary 객체를 딕셔너리로 변환
+        summary_data = []
+        for summary in limited_summaries:
+            summary_data.append({
+                "turn_id": summary.turn_id,
+                "user_message": summary.user_message,
+                "assistant_message": summary.assistant_message,
+                "summary": summary.summary,
+                "timestamp": summary.timestamp,
+                "tokens_estimated": summary.tokens_estimated,
+                "key_topics": summary.key_topics
+            })
+        
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "turn_summaries": summary_data,
+            "total_count": len(summaries),
+            "limited_count": len(limited_summaries)
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/advanced/compression-history/{session_id}")
+async def get_compression_history(session_id: str):
+    """압축 히스토리 조회"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        if session_id not in context_manager.compression_history:
+            return {
+                "status": "success",
+                "session_id": session_id,
+                "compression_history": [],
+                "total_compressions": 0
+            }
+        
+        history = context_manager.compression_history[session_id]
+        
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "compression_history": history,
+            "total_compressions": len(history)
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/advanced/optimized/{session_id}")
+async def get_optimized_context(session_id: str, model_name: str = "default"):
+    """모델별 최적화된 컨텍스트 조회 (요약 포함)"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        # 모델별 최적화된 컨텍스트 가져오기
+        optimized_context = context_manager.get_context_for_model(model_name, session_id)
+        estimated_tokens = context_manager._estimate_tokens(optimized_context)
+        
+        # 컨텍스트 요약 정보도 함께 제공
+        context_summary = context_manager.get_context_summary(session_id)
+        summary_stats = context_manager.get_summary_stats(session_id)
+        
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "model_name": model_name,
+            "optimized_context": optimized_context,
+            "estimated_tokens": estimated_tokens,
+            "context_length": len(optimized_context),
+            "context_summary": context_summary,
+            "summary_stats": summary_stats
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/context/advanced/export-enhanced/{session_id}")
+async def export_enhanced_context(session_id: str, file_path: str = Form(None)):
+    """향상된 컨텍스트 내보내기 (요약 정보 포함)"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        exported_path = context_manager.export_context(file_path, session_id)
+        
+        if exported_path:
+            return {
+                "status": "success",
+                "message": f"세션 {session_id} 향상된 컨텍스트 내보내기 완료",
+                "file_path": exported_path,
+                "session_id": session_id
+            }
+        else:
+            return {"status": "error", "message": "내보내기 실패"}
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/context/advanced/import-enhanced")
+async def import_enhanced_context(file_path: str = Form(...)):
+    """향상된 컨텍스트 가져오기 (요약 정보 포함)"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        success = context_manager.import_context(file_path)
+        
+        if success:
+            return {
+                "status": "success",
+                "message": "향상된 컨텍스트 가져오기 완료",
+                "file_path": file_path,
+                "context_summary": context_manager.get_context_summary("default")
+            }
+        else:
+            return {"status": "error", "message": "가져오기 실패"}
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.get("/context/advanced/health-check")
+async def advanced_context_health_check():
+    """고급 컨텍스트 관리자 상태 확인"""
+    try:
+        if not context_manager:
+            return {"status": "error", "message": "Context manager not available"}
+        
+        # 기본 상태 확인
+        basic_status = {
+            "context_manager_available": True,
+            "total_sessions": len(context_manager.session_conversations),
+            "max_tokens": context_manager.max_tokens,
+            "max_turns": context_manager.max_turns,
+            "strategy": context_manager.strategy
+        }
+        
+        # 요약 시스템 상태 확인
+        summary_status = {
+            "summarization_enabled": context_manager.enable_summarization,
+            "current_summary_method": context_manager.current_summary_method,
+            "available_summary_methods": list(context_manager.summary_models.keys()),
+            "summary_threshold": context_manager.summary_threshold,
+            "max_summary_tokens": context_manager.max_summary_tokens
+        }
+        
+        # 자동 정리 상태 확인
+        cleanup_status = context_manager.get_auto_cleanup_config()
+        
+        # 세션별 상세 정보
+        session_details = {}
+        for session_id in context_manager.session_conversations.keys():
+            session_details[session_id] = {
+                "turns": len(context_manager.session_conversations[session_id]),
+                "turn_summaries": len(context_manager.turn_summaries.get(session_id, [])),
+                "compression_history": len(context_manager.compression_history.get(session_id, [])),
+                "context_summary": context_manager.get_context_summary(session_id),
+                "summary_stats": context_manager.get_summary_stats(session_id)
+            }
+        
+        return {
+            "status": "success",
+            "basic_status": basic_status,
+            "summary_status": summary_status,
+            "cleanup_status": cleanup_status,
+            "session_details": session_details,
+            "timestamp": time.time()
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+"" 
+
+# ============================================================================
+# 사용자 메모리 설정 관리 API
+# ============================================================================
+
+@app.get("/user/memory/settings/{user_id}")
+async def get_user_memory_settings(user_id: str):
+    """사용자 메모��� 설정 조회"""
+    try:
+        from lily_llm_core.user_memory_manager import user_memory_manager
+        
+        # 기본 설정 조회
+        keep_memory = user_memory_manager.get_memory_setting(user_id, "keep_memory_on_room_change")
+        
+        return {
+            "status": "success",
+            "user_id": user_id,
+            "settings": {
+                "keep_memory_on_room_change": keep_memory if keep_memory is not None else True
+            }
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/user/memory/settings/{user_id}")
+async def update_user_memory_settings(
+    user_id: str,
+    keep_memory_on_room_change: bool = Form(True)
+):
+    """사용자 메모리 설정 업데이트"""
+    try:
+        from lily_llm_core.user_memory_manager import user_memory_manager
+        
+        # 설정 업데이트
+        success = user_memory_manager.update_memory_setting(
+            user_id, "keep_memory_on_room_change", keep_memory_on_room_change
+        )
+        
+        if success:
+            return {
+                "status": "success",
+                "message": f"사용자 {user_id} 메모리 설정 업데이트 완료",
+                "settings": {
+                    "keep_memory_on_room_change": keep_memory_on_room_change
+                }
+            }
+        else:
+            return {"status": "error", "message": "설정 업데이트 실패"}
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+
+@app.post("/user/memory/room-change/{user_id}")
+async def handle_room_change(user_id: str, new_room_id: str = Form(...)):
+    """Room 변경 시 메모리 처리"""
+    try:
+        from lily_llm_core.user_memory_manager import user_memory_manager
+        from lily_llm_core.integrated_memory_manager import integrated_memory_manager
+        
+        # 사용자 설정 확인
+        keep_memory = user_memory_manager.get_memory_setting(user_id, "keep_memory_on_room_change")
+        
+        if keep_memory:
+            # 메모리 유지 (기본 동작)
+            logger.info(f"🔄 사용자 {user_id}가 room {new_room_id}로 이동 - 메모리 유지")
+            return {
+                "status": "success",
+                "message": f"Room {new_room_id}로 이동 - 메모리 유지됨",
+                "memory_preserved": True
+            }
+        else:
+            # 메모리 초기화
+            logger.info(f"🔄 사용자 {user_id}가 room {new_room_id}로 이동 - 메모리 초기화")
+            
+            # 세션 컨텍스트 초기화
+            if context_manager:
+                # 사용자 관련 세션들 찾아서 초기화
+                user_sessions = [
+                    session_id for session_id in context_manager.session_conversations.keys()
+                    if f"user_{user_id}" in session_id
+                ]
+                
+                for session_id in user_sessions:
+                    context_manager.clear_session_context(session_id)
+                    logger.info(f"🗑️ 세션 컨텍스트 초기화: {session_id}")
+            
+            # Room 컨텍스트 초기화 (사용자 관련 문서 제거)
+            try:
+                room_context = integrated_memory_manager.room_context_manager.get_room_context(new_room_id)
+                if room_context and room_context.documents:
+                    # 사용자가 업로드한 문서들 제거
+                    original_count = len(room_context.documents)
+                    room_context.documents = [
+                        doc for doc in room_context.documents
+                        if (isinstance(doc, dict) and doc.get('uploaded_by') != user_id) or
+                           (hasattr(doc, 'uploaded_by') and getattr(doc, 'uploaded_by') != user_id)
+                    ]
+                    
+                    # 변경사항 저장
+                    integrated_memory_manager.room_context_manager.save_room_context(new_room_id, room_context)
+                    
+                    removed_count = original_count - len(room_context.documents)
+                    logger.info(f"��️ Room {new_room_id}에서 사용자 {user_id} 문서 {removed_count}개 제거")
+            except Exception as e:
+                logger.warning(f"⚠️ Room 컨텍스트 초기화 실패: {e}")
+            
+            return {
+                "status": "success",
+                "message": f"Room {new_room_id}로 이동 - 메모리 초기화됨",
+                "memory_preserved": False,
+                "context_cleared": True
+            }
+            
+    except Exception as e:
+        logger.error(f"❌ Room 변경 처리 실패: {e}")
+        return {"status": "error", "message": str(e)}
\ No newline at end of file