Spaces:

Sergidev
/

selflengthen

Runtime error

App Files Files Community

selflengthen / Dockerfile

Sergidev

Update Dockerfile

da440bd verified about 1 year ago

raw

history blame contribute delete

3.98 kB

	# Base image: CUDA 12.1 development variant on Ubuntu 22.04.
	FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

	# Install system dependencies (git + LFS, Python 3.10 with pip, and a C/C++
	# toolchain with ninja for packages that compile native code).
	# DEBIAN_FRONTEND is set inline rather than via ENV so that package
	# configuration prompts (e.g. tzdata) cannot stall the non-interactive build,
	# while the setting stays out of the runtime environment.
	# NOTE(review): --no-install-recommends is intentionally NOT used; the
	# recommended packages may be supplying headers/toolchain needed by later
	# source builds — confirm before tightening.
	RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
	    g++ \
	    gcc \
	    git \
	    git-lfs \
	    ninja-build \
	    python-is-python3 \
	    python3-pip \
	    python3.10 \
	    wget \
	    && rm -rf /var/lib/apt/lists/*

	# Relative paths in the instructions that follow resolve against /app.
	WORKDIR /app

	# Add an unprivileged account (UID 1000, with a home directory) and make it
	# the owner of the working directory.
	RUN useradd --create-home --uid 1000 user \
	    && chown -R user:user /app

	# Packaging/build helpers, numpy and a pinned torch go in first, ahead of the
	# heavier packages installed further down.
	RUN pip3 install --no-cache-dir packaging setuptools wheel numpy torch==2.4.0

	# CUDA locations (nothing is installed here): record the toolkit root, then
	# prepend its bin/ and lib64/ directories to the executable and library
	# search paths.
	ENV CUDA_HOME=/usr/local/cuda
	ENV PATH=$CUDA_HOME/bin:$PATH \
	    LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH

	# Fetch the Self-Lengthen sources, hoist the checkout's non-hidden contents
	# into the working directory, then discard the leftover clone directory
	# (including its .git metadata).
	RUN git clone https://github.com/QwenLM/Self-Lengthen.git \
	    && mv Self-Lengthen/* . \
	    && rm -rf Self-Lengthen

	# Install dependencies in order: general Python packages first, then
	# flash-attn (without build isolation, so its build sees the packages already
	# installed), then the pinned vllm stack.
	# NOTE: requirements.txt is copied into the working directory but this step
	# does not install from it; the packages are listed explicitly below.
	COPY requirements.txt .
	# Version specifiers containing ">" MUST be quoted: in a shell-form RUN an
	# unquoted `pkg>=1.0` is parsed as `pkg` with stdout redirected to a file
	# named `=1.0`, which silently drops the constraint and hides pip's output.
	RUN pip3 install --no-cache-dir \
	    transformers==4.43.2 \
	    accelerate \
	    peft \
	    datasets \
	    sentencepiece \
	    protobuf \
	    tiktoken \
	    scipy \
	    gradio \
	    "cn2an>=0.5.22" \
	    "langdetect>=1.0.9" \
	    openai \
	    tqdm \
	    && pip3 install --no-cache-dir flash-attn --no-build-isolation \
	    && pip3 install --no-cache-dir vllm==0.5.5 vllm-flash-attn

	# Install FastChat from the self-lengthen branch of the quanshr fork, with the
	# model_worker and webui extras.
	# --no-cache-dir matches every other pip invocation in this file and keeps
	# pip's download/wheel cache out of the image layer.
	RUN git clone -b self-lengthen https://github.com/quanshr/FastChat.git && \
	    cd FastChat && \
	    pip3 install --no-cache-dir ".[model_worker,webui]"

	# Install LLaMA Factory
	RUN pip3 install --no-cache-dir llamafactory

	# Make the models/ and results/ directories, then hand the whole /app tree
	# (including everything installed into it as root above) to the
	# unprivileged account.
	RUN mkdir -p models results \
	    && chown -R user:user /app

	# From here on, build steps and the container itself run as the
	# unprivileged account.
	USER user

	# Initialize git-lfs for this account
	RUN git lfs install

	# Runtime defaults, overridable when the container is started.
	# MODEL_PATH, INSTRUCT_COUNT and MAX_ITER are read by /app/start.sh.
	# The remaining values describe a single-GPU, single-process setup
	# (presumably consumed by the distributed launcher — confirm against run.sh).
	ENV CUDA_VISIBLE_DEVICES=0 \
	    WORLD_SIZE=1 \
	    RANK=0 \
	    MASTER_ADDR=localhost \
	    MASTER_PORT=29500 \
	    MODEL_PATH=/app/models/base_model \
	    INSTRUCT_COUNT=5000 \
	    MAX_ITER=3

	# Create startup script (/app/start.sh, executable).
	#
	# The script text is one single-quoted echo argument: each "\n" becomes a line
	# break in the generated file, and a trailing "\" joins the Dockerfile lines.
	# Because of the surrounding single quotes, the script body must never contain
	# a single-quote character.
	#
	# What the generated script does, in order:
	#   1. Clones Qwen/Qwen2-7B-Instruct into $MODEL_PATH if that directory is missing.
	#   2. Starts the FastChat controller (port 21001) and waits for it.
	#   3. Starts the FastChat openai_api_server (port 8000) and waits for it.
	#   4. Starts the FastChat vllm_worker (port 8080) and waits for it.
	#   5. Runs qwen/run.sh with $MODEL_PATH, $INSTRUCT_COUNT and $MAX_ITER.
	#   6. Starts the web interface (app.py) from /app.
	#
	# wait_for_service HOST PORT [RETRIES] polls the port with nc every 2 seconds
	# and exits the script with status 1 once RETRIES (default 30) polls have failed.
	RUN echo '#!/bin/bash\n\
	\n\
	# Function to wait for service\n\
	wait_for_service() {\n\
	local host="$1"\n\
	local port="$2"\n\
	local retries="${3:-30}"\n\
	while ! nc -z "$host" "$port" > /dev/null 2>&1; do\n\
	retries=$((retries-1))\n\
	if [ "$retries" -eq 0 ]; then\n\
	echo "Service $host:$port is not available after maximum retries"\n\
	exit 1\n\
	fi\n\
	echo "Waiting for service $host:$port..."\n\
	sleep 2\n\
	done\n\
	}\n\
	\n\
	# Download model if needed\n\
	if [ ! -d "$MODEL_PATH" ]; then\n\
	echo "Downloading model..."\n\
	mkdir -p "$MODEL_PATH"\n\
	git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct "$MODEL_PATH"\n\
	fi\n\
	\n\
	# Start FastChat services\n\
	python -m fastchat.serve.controller \
	--host 0.0.0.0 \
	--port 21001 > controller.log 2>&1 &\n\
	\n\
	# Wait for controller\n\
	wait_for_service localhost 21001\n\
	\n\
	python -m fastchat.serve.openai_api_server \
	--controller-address http://localhost:21001 \
	--host 0.0.0.0 \
	--port 8000 > api_server.log 2>&1 &\n\
	\n\
	# Wait for API server\n\
	wait_for_service localhost 8000\n\
	\n\
	# Start model worker\n\
	python -m fastchat.serve.vllm_worker \
	--model-names Qwen/Qwen2-7B-Instruct \
	--model-path "$MODEL_PATH" \
	--controller-address http://localhost:21001 \
	--host localhost \
	--port 8080 \
	--worker-address http://localhost:8080 > worker.log 2>&1 &\n\
	\n\
	# Wait for model worker: allow 300 polls (about 10 minutes) because the\n\
	# worker presumably loads the model weights before it starts listening\n\
	wait_for_service localhost 8080 300\n\
	\n\
	# Run the training process\n\
	cd /app/qwen\n\
	bash run.sh --base_model="$MODEL_PATH" --instruct_count="$INSTRUCT_COUNT" --max_iter="$MAX_ITER"\n\
	\n\
	# Start the web interface (app.py is copied to /app, not /app/qwen)\n\
	cd /app\n\
	python app.py\n' > /app/start.sh && \
	chmod +x /app/start.sh

	# netcat provides the nc command that /app/start.sh uses to poll service
	# ports. Package installation needs root, so switch accounts around the step
	# and drop back to the unprivileged account afterwards.
	USER root
	RUN apt-get update \
	    && apt-get install -y netcat-openbsd \
	    && rm -rf /var/lib/apt/lists/*
	USER user

	# Add the web interface script from the build context, owned by the
	# runtime account.
	COPY --chown=user:user app.py ./

	# Documented ports. start.sh launches the FastChat controller on 21001, the
	# openai_api_server on 8000 and the model worker on 8080; 7860 is presumably
	# the web interface served by app.py — confirm against app.py.
	EXPOSE 7860
	EXPOSE 8000
	EXPOSE 21001
	EXPOSE 8080

	# Container entry point: the generated startup script, in exec form.
	ENTRYPOINT ["/app/start.sh"]