# model_loader.py
import os
import sys
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    pipeline
)
import torch
import warnings
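# Suppress third-party library warnings so the loader's own status output stays readable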
warnings.filterwarnings("ignore")
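# Hugging Face Hub repository of the Phi-3-based medical-coding model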
MODEL_NAME = "RayyanAhmed9477/med-coding"
def load_model_and_tokenizer():
"""
Loads Phi-3 model with multiple fallback strategies.
Handles safetensors loading issues with robust error recovery.
"""
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"πŸ”§ Using device: {device}")
print(f"πŸ”§ PyTorch version: {torch.__version__}")
print(f"πŸ”§ Transformers version: {sys.modules['transformers'].__version__}")
# Get HuggingFace token from environment
hf_token = os.getenv("HF_TOKEN")
if hf_token:
print("πŸ”‘ HuggingFace token found")
else:
print("⚠️ No HuggingFace token - assuming public model")
    try:
        # ===== STEP 1: Load Tokenizer =====
        print(f"πŸ“₯ Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            token=hf_token,
            use_fast=True
        )

        # Configure tokenizer
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        if not hasattr(tokenizer, 'padding_side') or tokenizer.padding_side is None:
            tokenizer.padding_side = "left"
        print("βœ… Tokenizer loaded successfully")
        # ===== STEP 2: Load Configuration =====
        print(f"πŸ“₯ Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            token=hf_token
        )

        # Handle LongRoPE configuration
        if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
            rope_type = config.rope_scaling.get('type', 'default')
            print(f"πŸ“ RoPE scaling type detected: {rope_type}")
            if rope_type == 'longrope':
                print("βœ… LongRoPE configuration detected and supported")
        print(f"βœ… Config loaded: {config.model_type}")
        # ===== STEP 3: Load Model with Multiple Strategies =====
        print(f"πŸ“₯ Loading model: {MODEL_NAME}")
        print("⏳ This may take 2-5 minutes on first load...")

        model = None
        loading_strategies = []
        if device == "cuda":
            loading_strategies = [
                # Strategy 1: Standard GPU loading
                {
                    "name": "GPU Standard",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.bfloat16,
                        "device_map": "auto",
                        "token": hf_token,
                        "low_cpu_mem_usage": True
                    }
                }
            ]
        else:
            loading_strategies = [
                # Strategy 1: CPU with safetensors (preferred)
                {
                    "name": "CPU with safetensors",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "device_map": {"": "cpu"},
                        "token": hf_token,
                        "low_cpu_mem_usage": True,
                        "use_safetensors": True
                    }
                },
                # Strategy 2: CPU without explicit safetensors
                {
                    "name": "CPU standard",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "token": hf_token,
                        "low_cpu_mem_usage": True
                    }
                },
                # Strategy 3: CPU with PyTorch weights fallback
                {
                    "name": "CPU PyTorch weights",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "token": hf_token,
                        "low_cpu_mem_usage": True,
                        "use_safetensors": False
                    }
                },
                # Strategy 4: Minimal parameters
                {
                    "name": "CPU minimal",
                    "params": {
                        "trust_remote_code": True,
                        "token": hf_token
                    }
                }
            ]
        # Try each loading strategy
        for idx, strategy in enumerate(loading_strategies, 1):
            try:
                print(f"\nπŸ”„ Attempt {idx}/{len(loading_strategies)}: {strategy['name']}")
                model = AutoModelForCausalLM.from_pretrained(
                    MODEL_NAME,
                    config=config,
                    **strategy['params']
                )

                # Move to CPU explicitly if needed
                if device == "cpu" and not strategy['params'].get('device_map'):
                    model = model.to("cpu")

                print(f"βœ… Model loaded successfully using: {strategy['name']}")
                break
            except Exception as e:
                print(f"❌ Strategy '{strategy['name']}' failed: {str(e)}")
                if idx == len(loading_strategies):
                    # All strategies failed
                    raise
                else:
                    print("⏭️ Trying next strategy...")
                    continue

        if model is None:
            raise RuntimeError("All loading strategies failed")

        # Set model to evaluation mode
        model.eval()

        # Disable gradients to save memory
        for param in model.parameters():
            param.requires_grad = False

        print("\nβœ… Model fully loaded and ready!")
        # ===== STEP 4: Create Pipeline =====
        print("πŸ”§ Creating text generation pipeline...")
        pipeline_kwargs = {}
        if getattr(model, "hf_device_map", None) is None:
            # Only pin an explicit device when the model was NOT dispatched via
            # device_map; accelerate-dispatched models must keep their placement.
            pipeline_kwargs["device"] = 0 if device == "cuda" else -1
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
            **pipeline_kwargs
        )
        print("βœ… Pipeline created successfully!")

        print("=" * 60)
        print("πŸŽ‰ MODEL READY FOR INFERENCE")
        print("=" * 60)
        return gen_pipeline, tokenizer
    except Exception as e:
        print(f"\n❌ Error during model loading: {str(e)}")
        print("\nπŸ” Diagnostic Information:")
        print(f"  - Model: {MODEL_NAME}")
        print(f"  - Device: {device}")
        print(f"  - Token available: {hf_token is not None}")
        import traceback
        traceback.print_exc()
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "All loading strategies were exhausted. "
            "This could be due to: "
            "1) model file corruption during download, "
            "2) insufficient memory, or "
            "3) model incompatibility. "
            "Try upgrading the Space to GPU or use a different model."
        ) from e
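

# ---------------------------------------------------------------------------
# Minimal local smoke test (illustrative sketch, not part of the original
# module). Assumes the environment has enough memory for the model and a
# valid HF_TOKEN if the repository is gated; the prompt is a made-up example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    gen_pipeline, tokenizer = load_model_and_tokenizer()
    result = gen_pipeline(
        "Assign the most likely ICD-10 code for: acute appendicitis.",
        max_new_tokens=64,
        do_sample=False,
    )
    print(result[0]["generated_text"])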