if [[ "$QUANTIZATION" == "false" ]]; then | |
text-generation-launcher --model-id $MODEL_NAME \ | |
--num-shard 1 --port 8080 --trust-remote-code \ | |
--max-concurrent-requests $MAX_CONCURRENT_REQUESTS \ | |
--max-input-length $MAX_INPUT_LENGTH \ | |
--max-total-tokens $MAX_TOTAL_TOKENS \ | |
& | |
else | |
text-embeddings-router --model-id $MODEL_NAME \ | |
--port 8080 \ | |
--max-concurrent-requests $MAX_CONCURRENT_REQUESTS \ | |
--dtype $DTYPE \ | |
& | |
fi | |
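
# Either branch serves HTTP on port 8080, so the health check below works the
# same way regardless of which server was started.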
# Wait for the inference server to start (both TGI and TEI expose a /health endpoint)
curl --retry 60 --retry-delay 10 --retry-connrefused http://127.0.0.1:8080/health
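
# Example requests for manually exercising whichever server came up
# (for reference only; payloads follow the public TGI /generate and TEI /embed APIs):
#   curl -s http://127.0.0.1:8080/generate -X POST -H 'Content-Type: application/json' \
#     -d '{"inputs": "Hello", "parameters": {"max_new_tokens": 16}}'
#   curl -s http://127.0.0.1:8080/embed -X POST -H 'Content-Type: application/json' \
#     -d '{"inputs": "Hello"}'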
# Start the Gradio app
python3 app/main.py --port "$GRADIO_PORT" &
# Wait for any process to exit
wait -n
# Exit with the status of the process that exited first
exit $?