import modal

app = modal.App("census-qa-api-cpu")
vol_checkpoints = modal.Volume.from_name("model-checkpoints")

# CPU-only image (no CUDA)
image = modal.Image.debian_slim(python_version="3.10").pip_install(
    "torch",
    "transformers",
    "peft",
    "accelerate",
    "bitsandbytes",
    "scipy",
    "huggingface_hub",
    "protobuf",
    "sentencepiece",
    "fastapi",
)

# The model is loaded once per container (via @modal.enter) and reused across
# requests; the checkpoint volume is mounted at /data/checkpoints.
@app.cls(image=image, volumes={"/data/checkpoints": vol_checkpoints})
class ModelCPU:
    @modal.enter()
    def load(self):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from peft import PeftModel

        print("Loading model on CPU...")

        # Load base model and tokenizer
        base_model = "microsoft/Phi-3-mini-4k-instruct"
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)

        # Load base weights in full precision (no quantization on CPU)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            torch_dtype="auto",
            device_map="cpu",
        )

        # Attach the fine-tuned LoRA adapter from the mounted volume
        self.model = PeftModel.from_pretrained(
            model,
            "/data/checkpoints/phi3-census-lora",
        )
        print("Model loaded on CPU!")

    # POST endpoint; expects JSON like {"question": "...", "context": "..."}
    @modal.fastapi_endpoint(method="POST")
    def ask(self, data: dict):
        # Alpaca-style prompt, matching the format used during fine-tuning
        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data.get('question', '')}

### Input:
{data.get('context', 'Context: Japan Census data.')}

### Response:
"""
        inputs = self.tokenizer([prompt], return_tensors="pt")
        # Sampling must be enabled for temperature to take effect; 0.1 keeps
        # answers near-deterministic
        outputs = self.model.generate(
            **inputs, max_new_tokens=150, do_sample=True, temperature=0.1
        )
        response = self.tokenizer.batch_decode(outputs)[0]

        # Keep only the text generated after the response marker
        if "### Response:\n" in response:
            answer = response.split("### Response:\n")[1].split("<|endoftext|>")[0].strip()
        else:
            answer = response.strip()

        return {"question": data.get('question'), "answer": answer}

@app.local_entrypoint()
def main():
    print("CPU-based API endpoint")
    print("Deploy with: modal deploy docs/api_endpoint_cpu.py")
    print("Note: CPU inference is 10-20x slower than GPU")