# Server
PORT=3000
# Model from Hugging Face (Transformers)
MODEL_REPO_ID=Qwen/Qwen3-VL-2B-Thinking
# HF token for gated/private models (optional)
HF_TOKEN=
# Inference parameters
MAX_TOKENS=4096
TEMPERATURE=0.7
# Multimedia processing
MAX_VIDEO_FRAMES=16
# Transformers loading hints
DEVICE_MAP=auto
TORCH_DTYPE=auto
# Persistent SSE session store (SQLite)
# Enable to persist streaming chunks per session_id and allow resume after server restarts.
# 1=true, 0=false
PERSIST_SESSIONS=1
SESSIONS_DB_PATH=sessions.db
# TTL for sessions (seconds). Finished sessions older than TTL are garbage collected.
SESSIONS_TTL_SECONDS=600
# Auto compression and context reporting
# Enable automatic prompt compression if context would overflow. Drops oldest non-system messages.
ENABLE_AUTO_COMPRESSION=1
# Force a max context window for budgeting; 0 = use model/tokenizer defaults
CONTEXT_MAX_TOKENS_AUTO=0
# Safety margin kept free for generation and special tokens
CONTEXT_SAFETY_MARGIN=256
# Compression strategy: truncate (default). summarize reserved for future use.
COMPRESSION_STRATEGY=truncate