fullstuckdev committed e7ceaff (parent: 2e7d4a0): "fixing"

Files changed:
- Dockerfile +24 -11
- README.md +18 -7
- app.py +51 -125
- docker-compose.yml +16 -0
- requirements.txt +2 -1
Dockerfile
CHANGED
@@ -1,33 +1,46 @@
-# Use NVIDIA CUDA base image
-FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
+# Use NVIDIA CUDA base image with Python
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TRANSFORMERS_CACHE=/app/cache
+ENV PYTHONUNBUFFERED=1
+ENV PORT=7860
 
 # Set working directory
 WORKDIR /app
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
-    python3 \
+    python3.10 \
     python3-pip \
     git \
     && rm -rf /var/lib/apt/lists/*
 
+# Create cache directory and set permissions
+RUN mkdir -p /app/cache && \
+    mkdir -p /app/model/medical_llama_3b && \
+    chmod -R 777 /app/cache
+
 # Copy requirements first to leverage Docker cache
 COPY requirements.txt .
 
-#
-RUN
+# Update pip and install dependencies
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Install specific numpy version to fix compatibility
+RUN pip install --no-cache-dir "numpy<2.0.0"
 
 # Copy the rest of the application
 COPY . .
 
-# Create model directory
-RUN mkdir -p /app/model/medical_llama_3b
-
 # Expose port
 EXPOSE 7860
 
-# Set environment variables
-ENV
+# Set environment variables for GPU
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
 # Command to run the application
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["python3", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,12 +1,23 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Medical LLaMA API
+emoji: 🏥
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
 pinned: false
-
-app_port: 8000
+nvidia: true
 ---
 
-
+# Medical LLaMA API
+
+This is a FastAPI application serving a fine-tuned LLaMA model for medical queries.
+
+## Hardware Requirements
+- GPU: Required
+- Memory: 16GB minimum
+- Storage: 30GB minimum
+
+## Environment Variables
+- `MODEL_PATH`: Path to the model files
+- `TRANSFORMERS_CACHE`: Cache directory for Hugging Face
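For a quick smoke test of the running Space, a request like the sketch below should work. The /generate route name and the payload fields are assumptions carried over from the Query model in the previous version of app.py; this commit does not show the inference route itself.

# Hypothetical client call; the /generate route and payload fields are assumed
# from the old Query model (text, max_length, temperature), not confirmed by this commit.
import requests

response = requests.post(
    "http://localhost:7860/generate",
    json={
        "text": "What are the common symptoms of iron-deficiency anemia?",
        "max_length": 256,
        "temperature": 0.7,
    },
    timeout=120,
)
response.raise_for_status()
print(response.json()["generated_text"])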
app.py
CHANGED
@@ -1,138 +1,64 @@
 import os
 from fastapi import FastAPI, HTTPException, BackgroundTasks
-from
-from typing import List, Optional
+from fastapi.middleware.cors import CORSMiddleware
 import torch
-from
-
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    TrainingArguments,
-    Trainer,
-    DataCollatorForLanguageModeling
-)
-import uvicorn
-
-app = FastAPI(title="Medical LLaMA API")
-
-model = None
-tokenizer = None
-model_output_path = "./model/medical_llama_3b"
-
-class TrainRequest(BaseModel):
-    dataset_path: str
-    num_epochs: int = 3
-    batch_size: int = 4
-    learning_rate: float = 2e-5
-
-class Query(BaseModel):
-    text: str
-    max_length: int = 512
-    temperature: float = 0.7
-    num_return_sequences: int = 1
-
-class Response(BaseModel):
-    generated_text: List[str]
-
-def train_model(dataset_path: str, num_epochs: int, batch_size: int, learning_rate: float):
-    global model, tokenizer
-
-    os.makedirs(model_output_path, exist_ok=True)
-
-    model_name = "nvidia/Meta-Llama-3.2-3B-Instruct-ONNX-INT4"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
-
-    dataset = load_dataset("json", data_files=dataset_path)
-
-    def preprocess_function(examples):
-        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
-
-
-
-
-        remove_columns=dataset["train"].column_names
-    )
-
-
-
-
-        gradient_accumulation_steps=4,
-        num_train_epochs=num_epochs,
-        learning_rate=learning_rate,
-        fp16=True,
-        save_steps=500,
-        logging_steps=100,
-    )
-
-
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset["train"],
-        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
-    )
-
-    # Start training
-    trainer.train()
-
-    # Save the final model and tokenizer
-    model.save_pretrained(model_output_path)
-    tokenizer.save_pretrained(model_output_path)
-
-    print(f"Model and tokenizer saved to: {model_output_path}")
-
-@app.post("/train")
-async def train(request: TrainRequest, background_tasks: BackgroundTasks):
-    background_tasks.add_task(train_model, request.dataset_path, request.num_epochs, request.batch_size, request.learning_rate)
-    return {"message": "Training started in the background"}
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error loading model: {str(e)}")
-
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import logging
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Setup cache directory
+os.makedirs("/app/cache", exist_ok=True)
+os.environ['TRANSFORMERS_CACHE'] = "/app/cache"
+
+app = FastAPI(title="Medical LLaMA API")
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Check GPU availability
+def check_gpu():
+    if torch.cuda.is_available():
+        logger.info(f"GPU available: {torch.cuda.get_device_name(0)}")
+        return True
+    logger.warning("No GPU available, using CPU")
+    return False
+
+# Initialize model with proper device
+def init_model():
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-            num_return_sequences=query.num_return_sequences,
-            temperature=query.temperature,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
-
-        generated_texts = [
-            tokenizer.decode(g, skip_special_tokens=True)
-            for g in generated_ids
-        ]
-
-        return Response(generated_text=generated_texts)
-
+        device = "cuda" if check_gpu() else "cpu"
+        model_path = os.getenv("MODEL_PATH", "./model/medical_llama_3b")
+
+        logger.info(f"Loading model from {model_path}")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/app/cache")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            device_map="auto",
+            cache_dir="/app/cache"
+        )
+        return tokenizer, model
     except Exception as e:
-
-
-
-async def health_check():
-    return {"status": "healthy"}
-
-
+        logger.error(f"Error loading model: {str(e)}")
+        raise
+
+# Rest of your existing code...
+
+@app.on_event("startup")
+async def startup_event():
+    logger.info("Starting up application...")
+    try:
+        global tokenizer, model
+        tokenizer, model = init_model()
+        logger.info("Model loaded successfully")
+    except Exception as e:
+        logger.error(f"Failed to load model: {str(e)}")
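The rewritten app.py stops at the startup hook and leaves the request-handling routes behind the "# Rest of your existing code..." comment. A minimal sketch of what that elided part could look like, adapted from the removed version's Query/Response models and generation call, follows. The /generate path, the do_sample flag, and the exact generate() arguments are assumptions, and the code relies on the module-level app, model, tokenizer, and HTTPException already present in app.py.

# Hypothetical sketch of the elided inference route, adapted from the removed code.
# The /generate path and generate() arguments are assumptions, not part of this commit.
from typing import List
from pydantic import BaseModel

class Query(BaseModel):
    text: str
    max_length: int = 512
    temperature: float = 0.7
    num_return_sequences: int = 1

class Response(BaseModel):
    generated_text: List[str]

@app.post("/generate", response_model=Response)
async def generate(query: Query):
    try:
        # Tokenize the prompt and move tensors to the model's device
        inputs = tokenizer(query.text, return_tensors="pt").to(model.device)
        generated_ids = model.generate(
            **inputs,
            max_length=query.max_length,
            num_return_sequences=query.num_return_sequences,
            temperature=query.temperature,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        generated_texts = [
            tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids
        ]
        return Response(generated_text=generated_texts)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))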
docker-compose.yml
ADDED
@@ -0,0 +1,16 @@
+version: '3.8'
+services:
+  app:
+    build: .
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./model:/app/model
+      - ./cache:/app/cache
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
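Once the stack is up (docker compose up --build), a small readiness probe can confirm the API has started and the model finished loading. The /health route below is an assumption based on the health_check handler visible in the removed version of app.py; its route decorator did not survive in this diff.

# Hypothetical readiness probe; assumes the app still serves a /health route
# returning {"status": "healthy"} once the model is loaded.
import time
import requests

def wait_for_service(url: str = "http://localhost:7860/health", timeout_s: int = 300) -> bool:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            r = requests.get(url, timeout=5)
            if r.ok and r.json().get("status") == "healthy":
                return True
        except requests.RequestException:
            pass  # container still starting; keep polling
        time.sleep(5)
    return False

if __name__ == "__main__":
    print("ready" if wait_for_service() else "not ready after timeout")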
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ datasets==2.14.5
 pydantic==2.4.2
 python-multipart==0.0.6
 huggingface-hub==0.17.3
-accelerate==0.24.1
+accelerate==0.24.1
+numpy<2.0.0