Auto commit at 09-2025-08 6:57:03

Files changed:
- lily_llm_api/app_v2.py (+12 -11)
- lily_llm_api/models/kanana_1_5_v_3b_instruct.py (+4 -2)
- lily_llm_api/models/kanana_1_5_v_3b_instruct_250809_0055.py (+3 -1)
- lily_llm_core/config.py (+3 -3)
- test.py (+5 -3)
- test_hf_private.py (+60 -0)
lily_llm_api/app_v2.py
CHANGED

@@ -65,6 +65,7 @@ app.add_middleware(
         "http://127.0.0.1:8001",
         "http://localhost:3000",
         "http://127.0.0.1:3000",
+        "https://hearthchat-production.up.railway.app",
         "*"  # Allow all origins during development
     ],
     allow_credentials=True,

@@ -485,7 +486,7 @@ async def generate_multimodal(
         return_tensors="pt",
         padding=True,
         truncation=True,
-        max_length=
+        max_length=max_length,
     )

     if 'token_type_ids' in inputs:

@@ -557,10 +558,10 @@ async def generate_multimodal(
         top_k=40,
         top_p=top_p,
         repetition_penalty=1.1,
-        no_repeat_ngram_size=2,
+        # no_repeat_ngram_size=2,
         pad_token_id=tokenizer.eos_token_id,
         eos_token_id=tokenizer.eos_token_id,
-        use_cache=True
+        # use_cache=True
     )
     logger.info("✅ Real multimodal generation succeeded!")

@@ -578,9 +579,9 @@ async def generate_multimodal(
     enhanced_inputs = tokenizer(
         enhanced_formatted_prompt,
         return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=
+        # padding=True,
+        # truncation=True,
+        max_length=max_length
     )

     if 'token_type_ids' in enhanced_inputs:

@@ -597,10 +598,10 @@ async def generate_multimodal(
         top_k=40,
         top_p=top_p,
         repetition_penalty=1.1,
-        no_repeat_ngram_size=2,
+        # no_repeat_ngram_size=2,
         pad_token_id=tokenizer.eos_token_id,
         eos_token_id=tokenizer.eos_token_id,
-        use_cache=True
+        # use_cache=True
     )
     else:
         # Text-only generation

@@ -613,11 +614,11 @@ async def generate_multimodal(
         temperature=temperature,
         top_k=40,
         top_p=top_p,
-        repetition_penalty=1.1,
-        no_repeat_ngram_size=2,
+        # repetition_penalty=1.1,
+        # no_repeat_ngram_size=2,
         pad_token_id=tokenizer.eos_token_id,
         eos_token_id=tokenizer.eos_token_id,
-        use_cache=True
+        # use_cache=True
     )

     # Response extraction
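Taken together, the first hunk leaves the CORS setup looking roughly like the sketch below (a reconstruction from the diff, not the full file; the allow_methods/allow_headers lines are assumptions, since the diff does not show them). Keeping "*" in the list makes the explicit origins redundant, and the CORS spec disallows combining a wildcard Access-Control-Allow-Origin with credentials, which is worth checking before relying on cookie- or token-based browser auth here.

# Sketch reconstructed from the hunks above; assumes `app` is the FastAPI
# instance from app_v2.py. allow_methods/allow_headers are illustrative.
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://127.0.0.1:8001",
        "http://localhost:3000",
        "http://127.0.0.1:3000",
        "https://hearthchat-production.up.railway.app",
        "*",  # Allow all origins during development
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)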
lily_llm_api/models/kanana_1_5_v_3b_instruct.py
CHANGED

@@ -16,6 +16,8 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 logger = logging.getLogger(__name__)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

+max_new_tokens = 64
+
 class Kanana15V3bInstructProfile:
     """Kanana-1.5-v-3b-instruct model profile"""

@@ -181,7 +183,7 @@ class Kanana15V3bInstructProfile:
         token=HF_TOKEN,
         torch_dtype=torch.float16,
         trust_remote_code=True,
-        cache_dir="/app/cache/transformers",
+        # cache_dir="/app/cache/transformers",
         # device_map="auto",
         # low_cpu_mem_usage=True,
     ).to(DEVICE)

@@ -197,7 +199,7 @@ class Kanana15V3bInstructProfile:

     def get_generation_config(self) -> Dict[str, Any]:
         # Model parameter tuning; max_new_tokens: maximum length of generated text (increased for image descriptions)
-        return {"max_new_tokens":
+        return {"max_new_tokens": max_new_tokens, "temperature": 0.7, "do_sample": True, "top_k": 40, "top_p": 0.9, "repetition_penalty": 1.1}

     def extract_response(self, full_text: str, formatted_prompt: str = None, **kwargs) -> str:
         """
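Hoisting max_new_tokens to module level gives one knob for the output cap, though at 64 tokens the "increased for image descriptions" comment no longer quite matches. A minimal sketch of how the profile's generation config is typically consumed (model, tokenizer, and tokenized inputs are assumed to exist; the call pattern mirrors the model.generate hunks in app_v2.py above):

import torch

profile = Kanana15V3bInstructProfile()     # class from the diff above
gen_cfg = profile.get_generation_config()  # {"max_new_tokens": 64, "temperature": 0.7, ...}

with torch.no_grad():                      # inference only, no gradients needed
    output_ids = model.generate(
        **inputs,                          # tokenized prompt, prepared elsewhere
        **gen_cfg,                         # unpack the profile's sampling parameters
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)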
lily_llm_api/models/kanana_1_5_v_3b_instruct_250809_0055.py
CHANGED

@@ -16,6 +16,8 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 logger = logging.getLogger(__name__)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

+max_new_tokens = 64
+
 class Kanana15V3bInstructProfile:
     """Kanana-1.5-v-3b-instruct model profile"""

@@ -175,7 +177,7 @@ class Kanana15V3bInstructProfile:

     def get_generation_config(self) -> Dict[str, Any]:
         # Model parameter tuning; max_new_tokens: maximum length of generated text (increased for image descriptions)
-        return {"max_new_tokens":
+        return {"max_new_tokens": max_new_tokens, "temperature": 0.7, "do_sample": True, "top_k": 40, "top_p": 0.9, "repetition_penalty": 1.1}

     def extract_response(self, full_text: str, formatted_prompt: str = None, **kwargs) -> str:
         """
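This file appears to be a dated snapshot (250809_0055) of the profile above; its change mirrors the previous one exactly, minus the cache_dir hunk.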
lily_llm_core/config.py
CHANGED

@@ -27,16 +27,16 @@ class DatabaseSettings(BaseSettings):
 class ModelSettings(BaseSettings):
     """Model settings"""

-    default_model: str = Field(default="
+    default_model: str = Field(default="kanana-1.5-v-3b-instruct", description="Default model")
     model_cache_dir: str = Field(default="./models", description="Model cache directory")
     model_download_timeout: int = Field(default=3600, description="Model download timeout (seconds)")
     model_max_memory: str = Field(default="4GB", description="Maximum model memory usage")
     model_device: str = Field(default="auto", description="Model device (auto/cpu/cuda)")
     model_precision: str = Field(default="float16", description="Model precision (float16/float32/bfloat16)")

     # Per-model settings
+    kanana_1_5_v_3b_instruct_model_path: str = Field(default="./models/kanana_1_5_v_3b_instruct", description="Kanana 1.5 v 3b model path")
     polyglot_ko_1_3b_chat_model_path: str = Field(default="./models/polyglot-ko-1.3b-chat", description="Polyglot 1.3b model path")
-    kanana_1_5_v_3b_instruct_model_path: str = Field(default="./models/kanana_1_5_v_3b_instruct", description="Kanana 1.5 v 3b model path")
     polyglot_ko_5_8b_chat_model_path: str = Field(default="./models/polyglot-ko-5.8b-chat", description="Polyglot 5.8b model path")

 class Config:
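Since ModelSettings extends pydantic's BaseSettings, every field default above can be overridden by an environment variable. A minimal usage sketch (the case-insensitive env-var matching is pydantic's default behavior, assumed here):

import os

# Environment variables override Field defaults in pydantic BaseSettings.
os.environ["DEFAULT_MODEL"] = "polyglot-ko-1.3b-chat"

settings = ModelSettings()                           # class from the diff above
print(settings.default_model)                        # -> polyglot-ko-1.3b-chat
print(settings.kanana_1_5_v_3b_instruct_model_path)  # -> ./models/kanana_1_5_v_3b_instruct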
test.py
CHANGED

@@ -1,11 +1,13 @@
 import requests
 import json
 import os  # add the os module
+from dotenv import load_dotenv
+load_dotenv()

 # 1. Get the Hugging Face token from an environment variable.
 # Set it beforehand in a terminal with `set HUGGING_FACE_TOKEN=hf_...` (Windows) or
 # `export HUGGING_FACE_TOKEN=hf_...` (Mac/Linux).
-HF_TOKEN = os.getenv("
+HF_TOKEN = os.getenv("HF_TOKEN")

 # Hugging Face FastAPI server URL
 HF_API_BASE = "https://gbrabbit-lily-fast-api.hf.space"

@@ -26,7 +28,7 @@ def test_generate_text():

     data = {
         'prompt': 'Hello! Are you doing well in the private Space?',
-        'max_length':
+        'max_length': 20
     }

     print(f"📤 Request data (Form): {json.dumps(data, ensure_ascii=False)}")

@@ -36,7 +38,7 @@ def test_generate_text():
         f"{HF_API_BASE}/generate",
         headers=headers,  # <<-- auth header added!
         data=data,
-        timeout=
+        timeout=2000
     )

     print(f"✅ Status code: {response.status_code}")  # This should now show 200.
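Two details in this change are easy to miss: load_dotenv() (from the python-dotenv package) copies KEY=value pairs from a local .env file into the process environment before os.getenv("HF_TOKEN") runs, and requests interprets timeout in seconds, so timeout=2000 waits up to about 33 minutes, presumably to ride out a cold-starting Space. Note also that the comments still reference HUGGING_FACE_TOKEN while the code now reads HF_TOKEN. A minimal sketch of the token-loading pattern:

import os

from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()                     # reads ./.env into os.environ (no-op if the file is absent)
HF_TOKEN = os.getenv("HF_TOKEN")  # returns None when the variable is unset
if not HF_TOKEN:
    raise SystemExit("Set HF_TOKEN in .env or export it before running")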
test_hf_private.py
ADDED

@@ -0,0 +1,60 @@
+import requests
+import json
+import os  # add the os module
+from dotenv import load_dotenv
+load_dotenv()
+
+# 1. Get the Hugging Face token from an environment variable.
+# Set it beforehand in a terminal with `set HUGGING_FACE_TOKEN=hf_...` (Windows) or
+# `export HUGGING_FACE_TOKEN=hf_...` (Mac/Linux).
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+# Hugging Face FastAPI server URL
+HF_API_BASE = "https://gbrabbit-lily-fast-api.hf.space"
+
+def test_generate_text():
+    """Text generation test (with auth)"""
+    print("\n🚀 Text generation test...")
+
+    if not HF_TOKEN:
+        print("❌ The HUGGING_FACE_TOKEN environment variable is not set.")
+        return False
+
+    try:
+        # 2. Build the headers carrying the auth token.
+        headers = {
+            "Authorization": f"Bearer {HF_TOKEN}"
+        }
+
+        data = {
+            'prompt': 'Hello! Are you doing well in the private Space?',
+            'max_length': 128
+        }
+
+        print(f"📤 Request data (Form): {json.dumps(data, ensure_ascii=False)}")
+
+        # 3. Pass the headers parameter to the requests.post call.
+        response = requests.post(
+            f"{HF_API_BASE}/generate",
+            headers=headers,  # <<-- auth header added!
+            data=data,
+            timeout=500
+        )
+
+        print(f"✅ Status code: {response.status_code}")  # This should now show 200.
+
+        if response.status_code == 200:
+            result = response.json()
+            print(f"✅ Response: {json.dumps(result, indent=2, ensure_ascii=False)}")
+        else:
+            print(f"❌ Response: {response.text}")
+
+        return response.status_code == 200
+
+    except Exception as e:
+        print(f"❌ Text generation test failed: {e}")
+        return False
+
+# Run the script
+if __name__ == "__main__":
+    test_generate_text()
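test_hf_private.py is essentially the /generate test from test.py split into a standalone script: same endpoint and Bearer-token header, but with max_length 128 instead of 20 and timeout 500 seconds instead of 2000. It inherits the same naming mismatch, reading HF_TOKEN while its error message mentions HUGGING_FACE_TOKEN.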