code-slicer committed on
Commit
38845c7
·
verified ·
1 Parent(s): 8662746

Update app.py

Files changed (1)
  1. app.py +22 -15
app.py CHANGED
@@ -156,6 +156,13 @@ OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
 OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "gemma2:9b")
 OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "300"))
 
+# --- CPU speed tuning parameters ---
+FAST_MODE = True
+LLM_THREADS = int(os.getenv("LLM_THREADS", str(os.cpu_count() or 8)))  # override via the LLM_THREADS env var if needed
+LLM_NUM_PREDICT = 80 if FAST_MODE else 200  # generation token cap (80-128 feels responsive for a 9B model on CPU)
+LLM_NUM_CTX = 1024 if FAST_MODE else 2048   # context window (smaller is faster)
+HISTORY_WINDOW = 4                          # pass only the most recent N messages to the LLM
+
 
 KOREAN_SYSTEM_PROMPT = """당신은 한국어 어시스턴트입니다. 항상 한국어로 답하세요."""
 
@@ -250,16 +257,11 @@ def _call_ollama_chat(messages, model=OLLAMA_MODEL, temperature=0.8, top_p=0.9,
 
 
 def call_ollama_stream(messages, *, model: str = OLLAMA_MODEL,
-                       temperature: float = 0.8, top_p: float = 0.9,
-                       top_k: int = 40, repeat_penalty: float = 1.1,
-                       num_predict: int = 200, num_ctx: int = 2048,
+                       temperature: float = 0.7, top_p: float = 0.9,
+                       top_k: int = 20, repeat_penalty: float = 1.1,
+                       num_predict: int = LLM_NUM_PREDICT, num_ctx: int = LLM_NUM_CTX,
                        system_prompt: str | None = None):
-    """
-    Streaming generator for Ollama /api/chat.
-    In Streamlit it can be passed straight to st.write_stream(...).
-    """
     url = f"{OLLAMA_HOST}/api/chat"
-
     _msgs = []
     if system_prompt:
         _msgs.append({"role": "system", "content": system_prompt})
@@ -273,10 +275,12 @@ def call_ollama_stream(messages, *, model: str = OLLAMA_MODEL,
             "top_p": top_p,
             "top_k": top_k,
             "repeat_penalty": repeat_penalty,
-            "num_predict": num_predict,  # 128-256 recommended for CPU + 9B
-            "num_ctx": num_ctx  # 2048-4096
+            "num_predict": num_predict,
+            "num_ctx": num_ctx,
+            "num_thread": LLM_THREADS,  # ✅ number of CPU threads
+            # "use_mmap": True,  # (optional) attempt to reduce first-token latency
         },
-        "stream": True,  # ✅ the key part
+        "stream": True,
     }
 
     with requests.post(url, json=payload, stream=True, timeout=OLLAMA_TIMEOUT) as resp:
@@ -339,18 +343,21 @@ def render_llm_followup(chat_container, inline=False):
     st.session_state.setdefault("llm_msgs", [])
     st.session_state["llm_msgs"].append({"role": "user", "content": text})
 
+    # ✅ history slice (send only the most recent N messages)
+    def _last_msgs(n=HISTORY_WINDOW):
+        hist = st.session_state["llm_msgs"]
+        return hist[-n:] if len(hist) > n else hist
+
     # ✅ switched to a streaming call
     try:
         with st.chat_message("assistant"):
-            # send the system prompt plus the full history
-            msgs = st.session_state["llm_msgs"]
+            msgs = _last_msgs()  # ⬅️ the windowed history goes here
             full_text = st.write_stream(
                 call_ollama_stream(
                     msgs,
                     model=OLLAMA_MODEL,
                     system_prompt=KOREAN_SYSTEM_PROMPT,
-                    num_predict=200,  # adjust to 128-256 if needed
-                    num_ctx=2048
+                    # num_predict/num_ctx fall back to the module-level defaults
                 )
             )
             st.session_state["llm_msgs"].append({"role": "assistant", "content": full_text})
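The last hunk of call_ollama_stream shown above ends at the requests.post(...) context line, so the loop that actually parses the stream sits outside the diff. For reference, a generator meant for st.write_stream typically consumes Ollama's /api/chat stream, which is newline-delimited JSON where each object carries a message.content fragment and the final object has done set to true. The sketch below only illustrates that pattern; stream_chat is a hypothetical stand-in, not code from this commit.

import json
import requests

def stream_chat(url: str, payload: dict, timeout: int = 300):
    # Yield assistant text fragments from a streaming Ollama /api/chat call.
    # Assumes payload contains "stream": True, as set in the diff above.
    with requests.post(url, json=payload, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():  # one JSON object per line
            if not line:
                continue
            chunk = json.loads(line)
            piece = chunk.get("message", {}).get("content", "")
            if piece:
                yield piece
            if chunk.get("done"):  # final object carries stats, no content
                break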
 
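Since the commit is aimed at CPU latency (a smaller context window and token cap, an explicit num_thread, and a four-message history window), the effect is easiest to check by timing a direct call outside Streamlit. Below is a hypothetical harness, not part of this commit; it assumes app.py's module-level code is import-safe outside Streamlit and that an Ollama server is reachable at OLLAMA_HOST. KOREAN_SYSTEM_PROMPT says, roughly, "You are a Korean-language assistant. Always answer in Korean."

import time
from app import call_ollama_stream, OLLAMA_MODEL, KOREAN_SYSTEM_PROMPT

# Time one short exchange: time-to-first-fragment and total wall time.
msgs = [{"role": "user", "content": "안녕하세요, 자기소개를 해주세요."}]  # "Hello, please introduce yourself."

start = time.perf_counter()
pieces = []
for piece in call_ollama_stream(msgs, model=OLLAMA_MODEL,
                                system_prompt=KOREAN_SYSTEM_PROMPT):
    if not pieces:
        print(f"first fragment after {time.perf_counter() - start:.2f}s")
    pieces.append(piece)

total = time.perf_counter() - start
print(f"{len(''.join(pieces))} chars in {total:.2f}s")

Re-running the same prompt with FAST_MODE flipped, or with LLM_THREADS overridden in the environment, gives a rough before/after comparison.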