import modal

app = modal.App("census-qa-api-cpu")
vol_checkpoints = modal.Volume.from_name("model-checkpoints")

# CPU-only image (no CUDA)
image = modal.Image.debian_slim(python_version="3.10").pip_install(
    "torch",
    "transformers",
    "peft",
    "accelerate",
    "bitsandbytes",
    "scipy",
    "huggingface_hub",
    "protobuf",
    "sentencepiece",
    "fastapi",
)

# The model is loaded once per container (via @modal.enter) and reused across
# requests; the checkpoint volume is mounted at /data/checkpoints.
@app.cls(image=image, volumes={"/data/checkpoints": vol_checkpoints})
class ModelCPU:
    @modal.enter()
    def load(self):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from peft import PeftModel

        print("Loading model on CPU...")

        # Load base model and tokenizer
        base_model = "microsoft/Phi-3-mini-4k-instruct"
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)

        # Load base weights in full precision (no quantization on CPU)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            torch_dtype="auto",
            device_map="cpu",
        )

        # Attach the fine-tuned LoRA adapter from the mounted volume
        self.model = PeftModel.from_pretrained(
            model,
            "/data/checkpoints/phi3-census-lora",
        )
        print("Model loaded on CPU!")

    # POST endpoint; expects JSON like {"question": "...", "context": "..."}
    @modal.fastapi_endpoint(method="POST")
    def ask(self, data: dict):
        # Alpaca-style prompt, matching the format used during fine-tuning
        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data.get('question', '')}

### Input:
{data.get('context', 'Context: Japan Census data.')}

### Response:
"""
        inputs = self.tokenizer([prompt], return_tensors="pt")
        # Sampling must be enabled for temperature to take effect; 0.1 keeps
        # answers near-deterministic
        outputs = self.model.generate(
            **inputs, max_new_tokens=150, do_sample=True, temperature=0.1
        )
        response = self.tokenizer.batch_decode(outputs)[0]

        # Keep only the text generated after the response marker
        if "### Response:\n" in response:
            answer = response.split("### Response:\n")[1].split("<|endoftext|>")[0].strip()
        else:
            answer = response.strip()

        return {"question": data.get('question'), "answer": answer}

@app.local_entrypoint()
def main():
    print("CPU-based API endpoint")
    print("Deploy with: modal deploy docs/api_endpoint_cpu.py")
    print("Note: CPU inference is 10-20x slower than GPU")