# Server
PORT=3000
# Model from Hugging Face (Transformers)
MODEL_REPO_ID=Qwen/Qwen3-VL-2B-Thinking
# HF token for gated/private models (optional)
HF_TOKEN=
# Inference parameters
MAX_TOKENS=4096
TEMPERATURE=0.7
# Multimedia processing
MAX_VIDEO_FRAMES=16
# Transformers loading hints
DEVICE_MAP=auto
TORCH_DTYPE=auto
# Persistent SSE session store (SQLite)
# Enable to persist streaming chunks per session_id and allow resume after server restarts.
# 1=true, 0=false
PERSIST_SESSIONS=1
SESSIONS_DB_PATH=sessions.db
# TTL for sessions (seconds). Finished sessions older than TTL are garbage collected.
SESSIONS_TTL_SECONDS=600
# Auto compression and context reporting
# Enable automatic prompt compression if context would overflow. Drops oldest non-system messages.
ENABLE_AUTO_COMPRESSION=1
# Force a max context window for budgeting; 0 = use model/tokenizer defaults
CONTEXT_MAX_TOKENS_AUTO=0
# Safety margin kept free for generation and special tokens
CONTEXT_SAFETY_MARGIN=256
# Compression strategy: truncate (default). summarize reserved for future use.
COMPRESSION_STRATEGY=truncate