Commit e7ceaff
fullstuckdev committed on
1 Parent(s): 2e7d4a0
Files changed (5)
  1. Dockerfile +24 -11
  2. README.md +18 -7
  3. app.py +51 -125
  4. docker-compose.yml +16 -0
  5. requirements.txt +2 -1
Dockerfile CHANGED
@@ -1,33 +1,46 @@
-# Use NVIDIA CUDA base image
-FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
+# Use NVIDIA CUDA base image with Python
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TRANSFORMERS_CACHE=/app/cache
+ENV PYTHONUNBUFFERED=1
+ENV PORT=7860
 
 # Set working directory
 WORKDIR /app
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
-    python3 \
+    python3.10 \
     python3-pip \
     git \
     && rm -rf /var/lib/apt/lists/*
 
+# Create cache directory and set permissions
+RUN mkdir -p /app/cache && \
+    mkdir -p /app/model/medical_llama_3b && \
+    chmod -R 777 /app/cache
+
 # Copy requirements first to leverage Docker cache
 COPY requirements.txt .
 
-# Install Python dependencies
-RUN pip3 install --no-cache-dir -r requirements.txt
+# Update pip and install dependencies
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Install specific numpy version to fix compatibility
+RUN pip install --no-cache-dir "numpy<2.0.0"
 
 # Copy the rest of the application
 COPY . .
 
-# Create model directory
-RUN mkdir -p /app/model/medical_llama_3b
-
 # Expose port
 EXPOSE 7860
 
-# Set environment variables
-ENV MODEL_PATH=/app/model/medical_llama_3b
+# Set environment variables for GPU
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
 # Command to run the application
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["python3", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,23 @@
 ---
-title: Medication Ai Model
-emoji: 📈
-colorFrom: yellow
-colorTo: indigo
+title: Medical LLaMA API
+emoji: 🏥
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
 pinned: false
-license: apache-2.0
-app_port: 8000
+nvidia: true
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Medical LLaMA API
+
+This is a FastAPI application serving a fine-tuned LLaMA model for medical queries.
+
+## Hardware Requirements
+- GPU: Required
+- Memory: 16GB minimum
+- Storage: 30GB minimum
+
+## Environment Variables
+- `MODEL_PATH`: Path to the model files
+- `TRANSFORMERS_CACHE`: Cache directory for Hugging Face
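Note: `MODEL_PATH` is effectively optional here; app.py falls back to ./model/medical_llama_3b when it is unset, and the Dockerfile already sets TRANSFORMERS_CACHE=/app/cache.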
app.py CHANGED
@@ -1,138 +1,64 @@
 import os
 from fastapi import FastAPI, HTTPException, BackgroundTasks
-from pydantic import BaseModel
-from typing import List, Optional
+from fastapi.middleware.cors import CORSMiddleware
 import torch
-from datasets import load_dataset
-from transformers import (
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    TrainingArguments,
-    Trainer,
-    DataCollatorForLanguageModeling
-)
-import uvicorn
-
-app = FastAPI(title="Medical LLaMA API")
-
-model = None
-tokenizer = None
-model_output_path = "./model/medical_llama_3b"
-
-class TrainRequest(BaseModel):
-    dataset_path: str
-    num_epochs: int = 3
-    batch_size: int = 4
-    learning_rate: float = 2e-5
-
-class Query(BaseModel):
-    text: str
-    max_length: int = 512
-    temperature: float = 0.7
-    num_return_sequences: int = 1
-
-class Response(BaseModel):
-    generated_text: List[str]
-
-def train_model(dataset_path: str, num_epochs: int, batch_size: int, learning_rate: float):
-    global model, tokenizer
-
-    os.makedirs(model_output_path, exist_ok=True)
-
-    model_name = "nvidia/Meta-Llama-3.2-3B-Instruct-ONNX-INT4"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
-
-    dataset = load_dataset("json", data_files=dataset_path)
-
-    def preprocess_function(examples):
-        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
-
-    tokenized_dataset = dataset.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=dataset["train"].column_names
-    )
-
-    training_args = TrainingArguments(
-        output_dir=f"{model_output_path}/checkpoints",
-        per_device_train_batch_size=batch_size,
-        gradient_accumulation_steps=4,
-        num_train_epochs=num_epochs,
-        learning_rate=learning_rate,
-        fp16=True,
-        save_steps=500,
-        logging_steps=100,
-    )
-
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset["train"],
-        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
-    )
-
-    # Start training
-    trainer.train()
-
-    # Save the final model and tokenizer
-    model.save_pretrained(model_output_path)
-    tokenizer.save_pretrained(model_output_path)
-
-    print(f"Model and tokenizer saved to: {model_output_path}")
-
-@app.post("/train")
-async def train(request: TrainRequest, background_tasks: BackgroundTasks):
-    background_tasks.add_task(train_model, request.dataset_path, request.num_epochs, request.batch_size, request.learning_rate)
-    return {"message": "Training started in the background"}
-
-@app.post("/generate", response_model=Response)
-async def generate_text(query: Query):
-    global model, tokenizer
-
-    if model is None or tokenizer is None:
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(model_output_path)
-            model = AutoModelForCausalLM.from_pretrained(
-                model_output_path,
-                torch_dtype=torch.float16,
-                device_map="auto"
-            )
-        except Exception as e:
-            raise HTTPException(status_code=500, detail=f"Error loading model: {str(e)}")
-
-    try:
-        inputs = tokenizer(
-            query.text,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=query.max_length
-        ).to(model.device)
-
-        with torch.no_grad():
-            generated_ids = model.generate(
-                inputs.input_ids,
-                max_length=query.max_length,
-                num_return_sequences=query.num_return_sequences,
-                temperature=query.temperature,
-                pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-            )
-
-        generated_texts = [
-            tokenizer.decode(g, skip_special_tokens=True)
-            for g in generated_ids
-        ]
-
-        return Response(generated_text=generated_texts)
-
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy"}
-
-if __name__ == "__main__":
-    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False)
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import logging
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Setup cache directory
+os.makedirs("/app/cache", exist_ok=True)
+os.environ['TRANSFORMERS_CACHE'] = "/app/cache"
+
+app = FastAPI(title="Medical LLaMA API")
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Check GPU availability
+def check_gpu():
+    if torch.cuda.is_available():
+        logger.info(f"GPU available: {torch.cuda.get_device_name(0)}")
+        return True
+    logger.warning("No GPU available, using CPU")
+    return False
+
+# Initialize model with proper device
+def init_model():
+    try:
+        device = "cuda" if check_gpu() else "cpu"
+        model_path = os.getenv("MODEL_PATH", "./model/medical_llama_3b")
+
+        logger.info(f"Loading model from {model_path}")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/app/cache")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            device_map="auto",
+            cache_dir="/app/cache"
+        )
+        return tokenizer, model
+    except Exception as e:
+        logger.error(f"Error loading model: {str(e)}")
+        raise
+
+# Rest of your existing code...
+
+@app.on_event("startup")
+async def startup_event():
+    logger.info("Starting up application...")
+    try:
+        global tokenizer, model
+        tokenizer, model = init_model()
+        logger.info("Model loaded successfully")
+    except Exception as e:
+        logger.error(f"Failed to load model: {str(e)}")
docker-compose.yml ADDED
@@ -0,0 +1,16 @@
+version: '3.8'
+services:
+  app:
+    build: .
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./model:/app/model
+      - ./cache:/app/cache
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
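With this file in place, `docker compose up --build` starts the API on port 7860 with one GPU reserved (the `deploy.resources.reservations.devices` syntax needs a recent Docker Compose and the NVIDIA Container Toolkit on the host). The ./model and ./cache bind mounts keep the weights and the Hugging Face cache on the host, so container rebuilds do not re-download them.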
requirements.txt CHANGED
@@ -6,4 +6,5 @@ datasets==2.14.5
 pydantic==2.4.2
 python-multipart==0.0.6
 huggingface-hub==0.17.3
-accelerate==0.24.1
+accelerate==0.24.1
+numpy<2.0.0
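The numpy<2.0.0 pin now appears both here and as a dedicated RUN pip install step in the Dockerfile; either one alone would suffice, but the duplication is harmless.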