# Spaces status: Runtime error
# Image that installs the Self-Lengthen pipeline together with FastChat, vLLM
# and LLaMA Factory on a CUDA 12.1 base, and starts everything through
# /app/start.sh (generated further down in this file).
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

# Install system dependencies.
# netcat-openbsd provides `nc`, which /app/start.sh uses to poll service ports;
# it is installed here so no later root-owned apt layer is needed.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        git \
        git-lfs \
        python3.10 \
        python3-pip \
        python-is-python3 \
        wget \
        ninja-build \
        gcc \
        g++ \
        netcat-openbsd \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Create a non-root user that owns the application directory.
RUN useradd -m -u 1000 user && \
    chown -R user:user /app

# Install basic Python packages first; torch has to be present before
# flash-attn is built below with --no-build-isolation.
RUN pip3 install --no-cache-dir \
        packaging \
        setuptools \
        wheel \
        numpy \
        torch==2.4.0

# Point build tools at the CUDA toolkit shipped with the base image.
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Clone the Self-Lengthen repository and move its contents into /app.
RUN git clone https://github.com/QwenLM/Self-Lengthen.git && \
    mv Self-Lengthen/* . && \
    rm -rf Self-Lengthen

# NOTE(review): this file is copied into /app (replacing any requirements.txt
# that came from the clone above) but is never passed to pip below -- confirm
# whether it is still needed.
COPY requirements.txt .

# Install dependencies in order.
# Version specifiers containing '>' must be quoted: RUN uses a shell, and an
# unquoted "pkg>=1.0" is parsed as "install pkg, redirect stdout to '=1.0'".
RUN pip3 install --no-cache-dir \
        transformers==4.43.2 \
        accelerate \
        peft \
        datasets \
        sentencepiece \
        protobuf \
        tiktoken \
        scipy \
        gradio \
        "cn2an>=0.5.22" \
        "langdetect>=1.0.9" \
        openai \
        tqdm \
    && pip3 install --no-cache-dir flash-attn --no-build-isolation \
    && pip3 install --no-cache-dir vllm==0.5.5 vllm-flash-attn

# Install FastChat (self-lengthen branch of the quanshr fork).
RUN git clone -b self-lengthen https://github.com/quanshr/FastChat.git && \
    cd FastChat && \
    pip3 install --no-cache-dir ".[model_worker,webui]"

# Install LLaMA Factory.
RUN pip3 install --no-cache-dir llamafactory

# Create directories and hand everything under /app to the non-root user
# (the clones above ran as root).
RUN mkdir -p models results && \
    chown -R user:user /app

# Switch to non-root user.
USER user

# Initialize git-lfs for this user; start.sh clones the model with git.
RUN git lfs install

# Runtime configuration read by /app/start.sh and the launched processes.
ENV CUDA_VISIBLE_DEVICES=0
ENV WORLD_SIZE=1
ENV RANK=0
ENV MASTER_ADDR=localhost
ENV MASTER_PORT=29500
ENV MODEL_PATH=/app/models/base_model
ENV INSTRUCT_COUNT=5000
ENV MAX_ITER=3

# Create the startup script.
# printf '%s\n' writes one script line per argument, so the result does not
# depend on whether /bin/sh's echo interprets "\n". Everything is single-quoted,
# so "$VAR" references are expanded when the script runs, not at build time.
RUN printf '%s\n' \
        '#!/bin/bash' \
        '' \
        '# Poll host:port with nc until it accepts connections.' \
        '# $1 = host, $2 = port, $3 = max attempts, 2 s apart (default 30).' \
        'wait_for_service() {' \
        '  local host="$1"' \
        '  local port="$2"' \
        '  local retries="${3:-30}"' \
        '  while ! nc -z "$host" "$port" > /dev/null 2>&1; do' \
        '    retries=$((retries-1))' \
        '    if [ "$retries" -eq 0 ]; then' \
        '      echo "Service $host:$port is not available after maximum retries"' \
        '      exit 1' \
        '    fi' \
        '    echo "Waiting for service $host:$port..."' \
        '    sleep 2' \
        '  done' \
        '}' \
        '' \
        '# Download model if needed' \
        'if [ ! -d "$MODEL_PATH" ]; then' \
        '  echo "Downloading model..."' \
        '  mkdir -p "$MODEL_PATH"' \
        '  if ! git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct "$MODEL_PATH"; then' \
        '    # Remove the partial clone so the next start retries the download.' \
        '    echo "Model download failed" >&2' \
        '    rm -rf -- "${MODEL_PATH:?}"' \
        '    exit 1' \
        '  fi' \
        'fi' \
        '' \
        '# Start FastChat services' \
        'python -m fastchat.serve.controller --host 0.0.0.0 --port 21001 > controller.log 2>&1 &' \
        '' \
        '# Wait for controller' \
        'wait_for_service localhost 21001' \
        '' \
        'python -m fastchat.serve.openai_api_server --controller-address http://localhost:21001 --host 0.0.0.0 --port 8000 > api_server.log 2>&1 &' \
        '' \
        '# Wait for API server' \
        'wait_for_service localhost 8000' \
        '' \
        '# Start model worker' \
        'python -m fastchat.serve.vllm_worker --model-names Qwen/Qwen2-7B-Instruct --model-path "$MODEL_PATH" --controller-address http://localhost:21001 --host localhost --port 8080 --worker-address http://localhost:8080 > worker.log 2>&1 &' \
        '' \
        '# Wait for model worker. It has to load the model weights first, so it' \
        '# gets a longer budget than the lightweight services (300 x 2 s).' \
        'wait_for_service localhost 8080 300' \
        '' \
        '# Run the training process in a subshell so the working directory of' \
        '# this script stays /app.' \
        '(cd /app/qwen && bash run.sh --base_model="$MODEL_PATH" --instruct_count="$INSTRUCT_COUNT" --max_iter="$MAX_ITER")' \
        '' \
        '# Start the web interface (app.py is copied into /app, not /app/qwen)' \
        'python /app/app.py' \
        > /app/start.sh && \
    chmod +x /app/start.sh

# Copy the web interface entry point.
COPY --chown=user:user app.py .

# Ports: 7860 web interface, 8000 OpenAI-compatible API, 21001 controller,
# 8080 model worker.
EXPOSE 7860 8000 21001 8080

# Command to run.
ENTRYPOINT ["/app/start.sh"]