KillerKing93's picture
Sync from GitHub 8f6d598
7cd14d8 verified
raw
history blame
1.09 kB
# Server
# NOTE(review): presumably the TCP port the server listens on; confirm against the app's startup code.
PORT=3000
# Model from Hugging Face (Transformers)
# Repository ID in "<owner>/<name>" form.
MODEL_REPO_ID=Qwen/Qwen3-VL-2B-Thinking
# HF token for gated/private models (optional).
# Left empty in this file; do not commit a real token to version control.
HF_TOKEN=
# Inference parameters
# NOTE(review): presumably the cap on generated tokens per request; confirm against the server code.
MAX_TOKENS=4096
# NOTE(review): presumably the sampling temperature; confirm whether requests can override it.
TEMPERATURE=0.7
# Multimedia processing
# NOTE(review): presumably the maximum number of frames taken from a video input; confirm.
MAX_VIDEO_FRAMES=16
# Transformers loading hints
# NOTE(review): these look like they are forwarded to the Transformers model loader
# (device_map / torch_dtype); "auto" presumably defers the choice to the library. Confirm.
DEVICE_MAP=auto
TORCH_DTYPE=auto
# Persistent SSE session store (SQLite)
# Enable to persist streaming chunks per session_id and allow resuming after server restarts.
# 1=true, 0=false
PERSIST_SESSIONS=1
# Path to the SQLite database file backing the session store.
# NOTE(review): a relative path is presumably resolved against the server's working directory; confirm.
SESSIONS_DB_PATH=sessions.db
# TTL for sessions, in seconds (600 = 10 minutes).
# Finished sessions older than the TTL are garbage collected.
SESSIONS_TTL_SECONDS=600
# Auto compression and context reporting
# Enable automatic prompt compression if the context would overflow.
# Compression drops the oldest non-system messages.
# NOTE(review): presumably 1=true, 0=false, like PERSIST_SESSIONS; confirm.
ENABLE_AUTO_COMPRESSION=1
# Force a maximum context window for budgeting; 0 = use model/tokenizer defaults.
CONTEXT_MAX_TOKENS_AUTO=0
# Safety margin kept free for generation and special tokens.
# NOTE(review): presumably measured in tokens; confirm.
CONTEXT_SAFETY_MARGIN=256
# Compression strategy. "truncate" is the default; "summarize" is reserved for future use.
COMPRESSION_STRATEGY=truncate