Commit e7ceaff
fullstuckdev committed on
1 Parent(s): 2e7d4a0
Files changed (5)
  1. Dockerfile +24 -11
  2. README.md +18 -7
  3. app.py +51 -125
  4. docker-compose.yml +16 -0
  5. requirements.txt +2 -1
Dockerfile CHANGED
@@ -1,33 +1,46 @@
-# Use NVIDIA CUDA base image
-FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
+# Use NVIDIA CUDA base image with Python
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TRANSFORMERS_CACHE=/app/cache
+ENV PYTHONUNBUFFERED=1
+ENV PORT=7860
 
 # Set working directory
 WORKDIR /app
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
-    python3 \
+    python3.10 \
     python3-pip \
     git \
     && rm -rf /var/lib/apt/lists/*
 
+# Create cache directory and set permissions
+RUN mkdir -p /app/cache && \
+    mkdir -p /app/model/medical_llama_3b && \
+    chmod -R 777 /app/cache
+
 # Copy requirements first to leverage Docker cache
 COPY requirements.txt .
 
-# Install Python dependencies
-RUN pip3 install --no-cache-dir -r requirements.txt
+# Update pip and install dependencies
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Install specific numpy version to fix compatibility
+RUN pip install --no-cache-dir "numpy<2.0.0"
 
 # Copy the rest of the application
 COPY . .
 
-# Create model directory
-RUN mkdir -p /app/model/medical_llama_3b
-
 # Expose port
 EXPOSE 7860
 
-# Set environment variables
-ENV MODEL_PATH=/app/model/medical_llama_3b
+# Set environment variables for GPU
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
 # Command to run the application
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["python3", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,23 @@
 ---
-title: Medication Ai Model
-emoji: 📈
-colorFrom: yellow
-colorTo: indigo
+title: Medical LLaMA API
+emoji: 🏥
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
 pinned: false
-license: apache-2.0
-app_port: 8000
+nvidia: true
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Medical LLaMA API
+
+This is a FastAPI application serving a fine-tuned LLaMA model for medical queries.
+
+## Hardware Requirements
+- GPU: Required
+- Memory: 16GB minimum
+- Storage: 30GB minimum
+
+## Environment Variables
+- `MODEL_PATH`: Path to the model files
+- `TRANSFORMERS_CACHE`: Cache directory for Hugging Face
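Note: `MODEL_PATH` is effectively optional here; app.py falls back to ./model/medical_llama_3b when it is unset, and the Dockerfile already sets TRANSFORMERS_CACHE=/app/cache.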
app.py CHANGED
@@ -1,138 +1,64 @@
 import os
 from fastapi import FastAPI, HTTPException, BackgroundTasks
-from pydantic import BaseModel
-from typing import List, Optional
+from fastapi.middleware.cors import CORSMiddleware
 import torch
-from datasets import load_dataset
-from transformers import (
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    TrainingArguments,
-    Trainer,
-    DataCollatorForLanguageModeling
-)
-import uvicorn
-
-app = FastAPI(title="Medical LLaMA API")
-
-model = None
-tokenizer = None
-model_output_path = "./model/medical_llama_3b"
-
-class TrainRequest(BaseModel):
-    dataset_path: str
-    num_epochs: int = 3
-    batch_size: int = 4
-    learning_rate: float = 2e-5
-
-class Query(BaseModel):
-    text: str
-    max_length: int = 512
-    temperature: float = 0.7
-    num_return_sequences: int = 1
-
-class Response(BaseModel):
-    generated_text: List[str]
-
-def train_model(dataset_path: str, num_epochs: int, batch_size: int, learning_rate: float):
-    global model, tokenizer
-
-    os.makedirs(model_output_path, exist_ok=True)
-
-    model_name = "nvidia/Meta-Llama-3.2-3B-Instruct-ONNX-INT4"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
-
-    dataset = load_dataset("json", data_files=dataset_path)
-
-    def preprocess_function(examples):
-        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
-
-    tokenized_dataset = dataset.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=dataset["train"].column_names
-    )
-
-    training_args = TrainingArguments(
-        output_dir=f"{model_output_path}/checkpoints",
-        per_device_train_batch_size=batch_size,
-        gradient_accumulation_steps=4,
-        num_train_epochs=num_epochs,
-        learning_rate=learning_rate,
-        fp16=True,
-        save_steps=500,
-        logging_steps=100,
-    )
-
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset["train"],
-        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
-    )
-
-    # Start training
-    trainer.train()
-
-    # Save the final model and tokenizer
-    model.save_pretrained(model_output_path)
-    tokenizer.save_pretrained(model_output_path)
-
-    print(f"Model and tokenizer saved to: {model_output_path}")
-
-@app.post("/train")
-async def train(request: TrainRequest, background_tasks: BackgroundTasks):
-    background_tasks.add_task(train_model, request.dataset_path, request.num_epochs, request.batch_size, request.learning_rate)
-    return {"message": "Training started in the background"}
-
-@app.post("/generate", response_model=Response)
-async def generate_text(query: Query):
-    global model, tokenizer
-
-    if model is None or tokenizer is None:
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(model_output_path)
-            model = AutoModelForCausalLM.from_pretrained(
-                model_output_path,
-                torch_dtype=torch.float16,
-                device_map="auto"
-            )
-        except Exception as e:
-            raise HTTPException(status_code=500, detail=f"Error loading model: {str(e)}")
-
-    try:
-        inputs = tokenizer(
-            query.text,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=query.max_length
-        ).to(model.device)
-
-        with torch.no_grad():
-            generated_ids = model.generate(
-                inputs.input_ids,
-                max_length=query.max_length,
-                num_return_sequences=query.num_return_sequences,
-                temperature=query.temperature,
-                pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-            )
-
-        generated_texts = [
-            tokenizer.decode(g, skip_special_tokens=True)
-            for g in generated_ids
-        ]
-
-        return Response(generated_text=generated_texts)
-
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy"}
-
-if __name__ == "__main__":
-    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False)
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import logging
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Setup cache directory
+os.makedirs("/app/cache", exist_ok=True)
+os.environ['TRANSFORMERS_CACHE'] = "/app/cache"
+
+app = FastAPI(title="Medical LLaMA API")
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Check GPU availability
+def check_gpu():
+    if torch.cuda.is_available():
+        logger.info(f"GPU available: {torch.cuda.get_device_name(0)}")
+        return True
+    logger.warning("No GPU available, using CPU")
+    return False
+
+# Initialize model with proper device
+def init_model():
+    try:
+        device = "cuda" if check_gpu() else "cpu"
+        model_path = os.getenv("MODEL_PATH", "./model/medical_llama_3b")
+
+        logger.info(f"Loading model from {model_path}")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/app/cache")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            device_map="auto",
+            cache_dir="/app/cache"
+        )
+        return tokenizer, model
+    except Exception as e:
+        logger.error(f"Error loading model: {str(e)}")
+        raise
+
+# Rest of your existing code...
+
+@app.on_event("startup")
+async def startup_event():
+    logger.info("Starting up application...")
+    try:
+        global tokenizer, model
+        tokenizer, model = init_model()
+        logger.info("Model loaded successfully")
+    except Exception as e:
+        logger.error(f"Failed to load model: {str(e)}")
docker-compose.yml ADDED
@@ -0,0 +1,16 @@
+version: '3.8'
+services:
+  app:
+    build: .
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./model:/app/model
+      - ./cache:/app/cache
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
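With this file in place, `docker compose up --build` starts the API on port 7860 with one GPU reserved (the `deploy.resources.reservations.devices` syntax needs a recent Docker Compose and the NVIDIA Container Toolkit on the host). The ./model and ./cache bind mounts keep the weights and the Hugging Face cache on the host, so container rebuilds do not re-download them.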
requirements.txt CHANGED
@@ -6,4 +6,5 @@ datasets==2.14.5
 pydantic==2.4.2
 python-multipart==0.0.6
 huggingface-hub==0.17.3
-accelerate==0.24.1
+accelerate==0.24.1
+numpy<2.0.0
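The numpy<2.0.0 pin now appears both here and as a dedicated RUN pip install step in the Dockerfile; either one alone would suffice, but the duplication is harmless.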