#!/usr/bin/env python3
# Copyright (C) 2024 Louis Chua Bean Chong
#
# This file is part of OpenLLM.
#
# OpenLLM is dual-licensed:
# 1. For open source use: GNU General Public License v3.0
# 2. For commercial use: Commercial License (contact for details)
#
# See LICENSE and docs/LICENSES.md for full license information.
"""
OpenLLM Inference Server
This script implements the REST API server for OpenLLM model inference
as specified in Step 6 of the training pipeline.
Features:
- FastAPI-based REST API
- Support for multiple model formats (PyTorch, Hugging Face, ONNX)
- Text generation with configurable parameters
- Health checks and metrics
- Production-ready deployment
Usage:
python core/src/inference_server.py \
--model_path exports/huggingface/ \
--host 0.0.0.0 \
--port 8000 \
--max_length 512
API Endpoints:
POST /generate - Generate text from prompt
GET /health - Health check
GET /info - Model information
Author: Louis Chua Bean Chong
License: GPLv3
"""
import argparse
import json
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
import uvicorn
# FastAPI imports (open source)
try:
from fastapi import BackgroundTasks, FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
except ImportError:
raise ImportError("Install FastAPI: pip install fastapi uvicorn[standard]")
import os
import sys
import numpy as np
import sentencepiece as smp
import torch
# Add current directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model import create_model
class TextGenerationConfig(BaseModel):
"""Configuration for text generation parameters."""
max_new_tokens: int = Field(
256, description="Maximum number of tokens to generate", ge=1, le=2048
)
temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
top_k: Optional[int] = Field(40, description="Top-k sampling parameter", ge=1, le=1000)
top_p: Optional[float] = Field(0.9, description="Nucleus sampling parameter", ge=0.1, le=1.0)
num_return_sequences: int = Field(1, description="Number of sequences to generate", ge=1, le=5)
stop_sequences: Optional[List[str]] = Field(
None, description="Stop generation at these sequences"
)
class GenerationRequest(BaseModel):
"""Request model for text generation."""
prompt: str = Field(..., description="Input text prompt")
max_length: int = Field(256, description="Maximum generation length", ge=1, le=2048)
temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
top_k: Optional[int] = Field(40, description="Top-k sampling parameter", ge=1, le=1000)
top_p: Optional[float] = Field(0.9, description="Nucleus sampling parameter", ge=0.1, le=1.0)
num_return_sequences: int = Field(1, description="Number of sequences to generate", ge=1, le=5)
stop_sequences: Optional[List[str]] = Field(
None, description="Stop generation at these sequences"
)
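# Illustrative /generate request body (fields mirror GenerationRequest above;
# values are examples only):
#
#   {
#     "prompt": "The quick brown fox",
#     "max_length": 128,
#     "temperature": 0.8,
#     "top_k": 40,
#     "top_p": 0.9,
#     "num_return_sequences": 1,
#     "stop_sequences": ["\n\n"]
#   }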
class GenerationResponse(BaseModel):
"""Response model for text generation."""
generated_text: List[str] = Field(..., description="Generated text sequences")
prompt: str = Field(..., description="Original prompt")
generation_time: float = Field(..., description="Generation time in seconds")
parameters: Dict[str, Any] = Field(..., description="Generation parameters used")
class ModelInfo(BaseModel):
"""Model information response."""
model_name: str
model_size: str
parameters: int
vocab_size: int
max_length: int
format: str
loaded_at: str
class HealthResponse(BaseModel):
"""Health check response."""
status: str
model_loaded: bool
uptime_seconds: float
total_requests: int
class OpenLLMInference:
"""
OpenLLM model inference engine.
Supports multiple model formats and provides text generation capabilities.
"""
def __init__(self, model_path: str, model_format: str = "auto"):
"""
Initialize inference engine.
Args:
model_path: Path to exported model directory
model_format: Model format (pytorch, huggingface, onnx, auto)
"""
self.model_path = Path(model_path)
self.model_format = model_format
self.model = None
self.tokenizer = None
self.config = None
self.device = "cuda" if torch.cuda.is_available() else "cpu"
# Load model
self._load_model()
# Statistics
self.loaded_at = time.time()
self.total_requests = 0
print("🚀 OpenLLM Inference Engine initialized")
print(f" Model: {self.config.get('model_name', 'Unknown')}")
print(f" Format: {self.detected_format}")
print(f" Device: {self.device}")
def _detect_format(self) -> str:
"""Auto-detect model format from directory contents."""
if (self.model_path / "model.pt").exists():
return "pytorch"
elif (self.model_path / "pytorch_model.bin").exists():
return "huggingface"
elif (self.model_path / "model.onnx").exists():
return "onnx"
else:
raise ValueError(f"Could not detect model format in {self.model_path}")
def _load_model(self):
"""Load model based on detected format."""
if self.model_format == "auto":
self.detected_format = self._detect_format()
else:
self.detected_format = self.model_format
print(f"📂 Loading {self.detected_format} model from {self.model_path}")
if self.detected_format == "pytorch":
self._load_pytorch_model()
elif self.detected_format == "huggingface":
self._load_huggingface_model()
elif self.detected_format == "onnx":
self._load_onnx_model()
else:
raise ValueError(f"Unsupported format: {self.detected_format}")
# Load tokenizer
self._load_tokenizer()
print("✅ Model loaded successfully")
def _load_pytorch_model(self):
"""Load PyTorch format model."""
# Load config
with open(self.model_path / "config.json", "r") as f:
config_data = json.load(f)
self.config = config_data["model_config"]
# Load model
checkpoint = torch.load(self.model_path / "model.pt", map_location=self.device)
# Determine model size
n_layer = self.config.get("n_layer", 12)
if n_layer <= 6:
model_size = "small"
elif n_layer <= 12:
model_size = "medium"
else:
model_size = "large"
# Create model
self.model = create_model(model_size)
self.model.load_state_dict(checkpoint["model_state_dict"])
self.model.to(self.device)
self.model.eval()
def _load_huggingface_model(self):
"""Load Hugging Face format model."""
# Load config
with open(self.model_path / "config.json", "r") as f:
self.config = json.load(f)
# Load model weights
state_dict = torch.load(self.model_path / "pytorch_model.bin", map_location=self.device)
# Determine model size
n_layer = self.config.get("n_layer", 12)
if n_layer <= 6:
model_size = "small"
elif n_layer <= 12:
model_size = "medium"
else:
model_size = "large"
# Create model
self.model = create_model(model_size)
self.model.load_state_dict(state_dict)
self.model.to(self.device)
self.model.eval()
def _load_onnx_model(self):
"""Load ONNX format model."""
try:
import onnxruntime as ort
except ImportError:
raise ImportError("ONNX inference requires: pip install onnxruntime")
# Security mitigation: Validate model path to prevent arbitrary file access
model_file = self.model_path / "model.onnx"
if not model_file.exists():
raise FileNotFoundError(f"ONNX model not found: {model_file}")
# Security mitigation: Validate that the resolved file stays within the expected directory
if not str(model_file.resolve()).startswith(str(self.model_path.resolve())):
raise ValueError(f"Invalid model path: {model_file}")
# Load metadata with path validation
metadata_file = self.model_path / "metadata.json"
if not metadata_file.exists():
raise FileNotFoundError(f"ONNX metadata not found: {metadata_file}")
with open(metadata_file, "r") as f:
metadata = json.load(f)
self.config = metadata["model_config"]
# Create ONNX session with security options
providers = (
["CUDAExecutionProvider", "CPUExecutionProvider"]
if torch.cuda.is_available()
else ["CPUExecutionProvider"]
)
# Security mitigation: Use session options to restrict capabilities
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
session_options.enable_mem_pattern = False # Disable memory optimization
session_options.enable_cpu_mem_arena = False # Disable CPU memory arena
self.onnx_session = ort.InferenceSession(
str(model_file), providers=providers, sess_options=session_options
)
# ONNX models don't need device management
self.device = "onnx"
def _load_tokenizer(self):
"""Load tokenizer."""
tokenizer_path = self.model_path / "tokenizer.model"
if not tokenizer_path.exists():
raise FileNotFoundError(f"Tokenizer not found: {tokenizer_path}")
self.tokenizer = smp.SentencePieceProcessor()
self.tokenizer.load(str(tokenizer_path))
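# Note: the SentencePiece processor exposes encode() -> List[int] and
# decode(List[int]) -> str; a quick round trip (illustrative) looks like:
#   ids = self.tokenizer.encode("hello world")
#   text = self.tokenizer.decode(ids)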
def generate(
self,
prompt: str,
max_length: int = 256,
temperature: float = 0.7,
top_k: Optional[int] = 40,
top_p: Optional[float] = 0.9,
num_return_sequences: int = 1,
stop_sequences: Optional[List[str]] = None,
) -> List[str]:
"""
Generate text from prompt.
Args:
prompt: Input text prompt
max_length: Maximum generation length
temperature: Sampling temperature
top_k: Top-k sampling parameter
top_p: Nucleus sampling parameter
num_return_sequences: Number of sequences to generate
stop_sequences: Stop generation at these sequences
Returns:
List of generated text sequences
"""
self.total_requests += 1
if self.detected_format == "onnx":
return self._generate_onnx(
prompt, max_length, temperature, top_k, num_return_sequences, stop_sequences
)
else:
return self._generate_pytorch(
prompt, max_length, temperature, top_k, top_p, num_return_sequences, stop_sequences
)
def _generate_pytorch(
self,
prompt: str,
max_length: int,
temperature: float,
top_k: Optional[int],
top_p: Optional[float],
num_return_sequences: int,
stop_sequences: Optional[List[str]],
) -> List[str]:
"""Generate using PyTorch model."""
# Tokenize prompt
input_ids = self.tokenizer.encode(prompt)
input_tensor = torch.tensor(
[input_ids] * num_return_sequences, dtype=torch.long, device=self.device
)
# Generate
with torch.no_grad():
outputs = []
for _ in range(num_return_sequences):
# Use model's generate method if available
if hasattr(self.model, "generate"):
output = self.model.generate(
input_tensor[:1], # Single sequence
max_new_tokens=max_length,
temperature=temperature,
top_k=top_k,
)
generated_ids = output[0].tolist()
generated_text = self.tokenizer.decode(generated_ids[len(input_ids) :])
else:
# Fallback simple generation
generated_text = self._simple_generate(
input_tensor[:1], max_length, temperature
)
# Apply stop sequences
if stop_sequences:
for stop_seq in stop_sequences:
if stop_seq in generated_text:
generated_text = generated_text.split(stop_seq)[0]
break
outputs.append(generated_text)
return outputs
def _generate_onnx(
self,
prompt: str,
max_length: int,
temperature: float,
top_k: Optional[int],
num_return_sequences: int,
stop_sequences: Optional[List[str]],
) -> List[str]:
"""Generate using ONNX model."""
outputs = []
for _ in range(num_return_sequences):
# Tokenize prompt
tokens = self.tokenizer.encode(prompt)
generated = tokens.copy()
# Simple autoregressive generation
for _ in range(max_length):
if len(generated) >= 512: # Max sequence length for ONNX
break
# Prepare input (last 64 tokens to fit ONNX model)
current_input = np.array([generated[-64:]], dtype=np.int64)
# Run inference
logits = self.onnx_session.run(None, {"input_ids": current_input})[0]
next_token_logits = logits[0, -1, :]
# Apply temperature
if temperature > 0:
next_token_logits = next_token_logits / temperature
next_token_logits = next_token_logits - np.max(next_token_logits)  # Stabilize before exp
probs = np.exp(next_token_logits) / np.sum(np.exp(next_token_logits))
# Apply top-k if specified
if top_k:
top_indices = np.argpartition(probs, -top_k)[-top_k:]
probs_filtered = np.zeros_like(probs)
probs_filtered[top_indices] = probs[top_indices]
probs = probs_filtered / np.sum(probs_filtered)
next_token = np.random.choice(len(probs), p=probs)
else:
# Greedy decoding fallback
next_token = np.argmax(next_token_logits)
generated.append(int(next_token))
# Decode generated text
generated_text = self.tokenizer.decode(generated[len(tokens) :])
# Apply stop sequences
if stop_sequences:
for stop_seq in stop_sequences:
if stop_seq in generated_text:
generated_text = generated_text.split(stop_seq)[0]
break
outputs.append(generated_text)
return outputs
def _simple_generate(
self, input_tensor: torch.Tensor, max_length: int, temperature: float
) -> str:
"""Simple fallback generation method."""
generated = input_tensor[0].tolist()
for _ in range(max_length):
if len(generated) >= self.config.get("block_size", 1024):
break
# Forward pass
current_input = torch.tensor([generated], dtype=torch.long, device=self.device)
with torch.no_grad():
logits, _ = self.model(current_input)
# Get next token logits and apply temperature
next_token_logits = logits[0, -1, :] / temperature
probs = torch.softmax(next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).item()
generated.append(next_token)
# Decode only the generated part
original_length = input_tensor.size(1)
generated_tokens = generated[original_length:]
return self.tokenizer.decode(generated_tokens)
def get_info(self) -> Dict[str, Any]:
"""Get model information."""
return {
"model_name": self.config.get("model_name", "OpenLLM"),
"model_size": self.config.get("model_size", "unknown"),
"parameters": self.config.get("n_embd", 0)
* self.config.get("n_layer", 0), # Approximate
"vocab_size": self.config.get("vocab_size", self.tokenizer.vocab_size()),
"max_length": self.config.get("block_size", 1024),
"format": self.detected_format,
"loaded_at": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.loaded_at)),
}
def get_health(self) -> Dict[str, Any]:
"""Get health status."""
return {
"status": "healthy",
"model_loaded": self.model is not None,
"uptime_seconds": time.time() - self.loaded_at,
"total_requests": self.total_requests,
}
# Global inference engine
inference_engine: Optional[OpenLLMInference] = None
# FastAPI app
app = FastAPI(
title="OpenLLM Inference API",
description="REST API for OpenLLM text generation",
version="0.1.0",
docs_url="/docs",
redoc_url="/redoc",
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Configure appropriately for production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.on_event("startup")
async def startup_event():
"""Initialize inference engine on startup."""
print("🚀 Starting OpenLLM Inference Server...")
# Note: Model loading is handled in the main() function
# For testing, call load_model_for_testing() to install an engine manually
global inference_engine
if inference_engine is None:
print("⚠️ No model loaded - server will return 503 for generation requests")
print(" Use main() function to load a real model")
print(" For testing, use load_model_for_testing() function")
@app.post("/generate", response_model=GenerationResponse)
async def generate_text(request: GenerationRequest, background_tasks: BackgroundTasks):
"""Generate text from prompt."""
if inference_engine is None:
raise HTTPException(status_code=503, detail="Model not loaded")
start_time = time.time()
try:
# Generate text
generated_texts = inference_engine.generate(
prompt=request.prompt,
max_length=request.max_length,
temperature=request.temperature,
top_k=request.top_k,
top_p=request.top_p,
num_return_sequences=request.num_return_sequences,
stop_sequences=request.stop_sequences,
)
generation_time = time.time() - start_time
return GenerationResponse(
generated_text=generated_texts,
prompt=request.prompt,
generation_time=generation_time,
parameters={
"max_length": request.max_length,
"temperature": request.temperature,
"top_k": request.top_k,
"top_p": request.top_p,
"num_return_sequences": request.num_return_sequences,
},
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
@app.post("/generate/stream")
async def generate_text_stream(request: GenerationRequest):
"""Generate text with streaming response."""
if inference_engine is None:
raise HTTPException(status_code=503, detail="Model not loaded")
try:
# For now, generate the full output and return it in one response;
# a real implementation would stream tokens as they are generated
generated_texts = inference_engine.generate(
prompt=request.prompt,
max_length=request.max_length,
temperature=request.temperature,
top_k=request.top_k,
top_p=request.top_p,
num_return_sequences=request.num_return_sequences,
stop_sequences=request.stop_sequences,
)
# Return the full result (placeholder for true streaming)
return {
"generated_text": generated_texts,
"prompt": request.prompt,
"streaming": True,
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
@app.get("/info", response_model=ModelInfo)
async def get_model_info():
"""Get model information."""
if inference_engine is None:
raise HTTPException(status_code=503, detail="Model not loaded")
info = inference_engine.get_info()
return ModelInfo(**info)
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint."""
if inference_engine is None:
return HealthResponse(
status="unhealthy", model_loaded=False, uptime_seconds=0.0, total_requests=0
)
health = inference_engine.get_health()
return HealthResponse(**health)
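# Example /health response (shape follows HealthResponse; numbers are illustrative):
#   {"status": "healthy", "model_loaded": true, "uptime_seconds": 123.4, "total_requests": 7}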
@app.get("/")
async def root():
"""Root endpoint."""
return {
"message": "OpenLLM Inference API",
"version": "0.1.0",
"docs": "/docs",
"health": "/health",
"info": "/info",
"endpoints": ["/generate", "/generate/stream", "/health", "/info"],
}
def main():
"""Main server function."""
parser = argparse.ArgumentParser(
description="OpenLLM Inference Server",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Start server with Hugging Face model
python core/src/inference_server.py \\
--model_path exports/huggingface/ \\
--host 0.0.0.0 \\
--port 8000
# Start server with ONNX model
python core/src/inference_server.py \\
--model_path exports/onnx/ \\
--format onnx \\
--port 8001
""",
)
parser.add_argument(
"--model_path",
required=True,
help="Path to exported model directory",
)
parser.add_argument(
"--format",
choices=["pytorch", "huggingface", "onnx", "auto"],
default="auto",
help="Model format (default: auto-detect)",
)
parser.add_argument(
"--host",
default="127.0.0.1",
help="Host to bind to (default: 127.0.0.1)",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Port to bind to (default: 8000)",
)
parser.add_argument(
"--max_length",
type=int,
default=512,
help="Maximum generation length (default: 512)",
)
args = parser.parse_args()
# Initialize inference engine
global inference_engine
inference_engine = OpenLLMInference(args.model_path, args.format)
# Start server
print(f"🚀 Starting server on {args.host}:{args.port}")
uvicorn.run(
app,
host=args.host,
port=args.port,
log_level="info",
)
def load_model(model_path: str, model_format: str = "auto"):
"""
Load model for testing purposes.
This function is used by tests to load models without starting the full server.
Args:
model_path: Path to exported model directory
model_format: Model format (pytorch, huggingface, onnx, auto)
Returns:
OpenLLMInference: Initialized inference engine
"""
return OpenLLMInference(model_path, model_format)
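# Example (a minimal sketch; assumes a model was already exported to
# exports/huggingface/ by the earlier pipeline steps):
#
#   engine = load_model("exports/huggingface/", model_format="huggingface")
#   texts = engine.generate("Hello, world", max_length=32, temperature=0.7)
#   print(texts[0])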
def load_model_for_testing(
model_path: str = "exports/huggingface", model_format: str = "huggingface"
):
"""
Load a real model for testing purposes.
Unlike create_test_model(), this loads an actual trained model from disk.
Args:
model_path: Path to the model directory (default: exports/huggingface)
model_format: Model format (default: huggingface)
Returns:
OpenLLMInference: Real inference engine with loaded model
"""
global inference_engine
try:
inference_engine = OpenLLMInference(model_path, model_format)
print(f"✅ Real model loaded for testing from {model_path}")
return inference_engine
except Exception as e:
print(f"❌ Failed to load real model: {e}")
# Fallback to mock model for testing
return create_test_model()
def create_test_model():
"""
Create a lightweight model for testing purposes.
This builds a real (non-mock) model with minimal parameters so tests can run
without downloading large model files.
Returns:
OpenLLMInference: Real lightweight inference engine
"""
try:
# Create a real model with minimal parameters
from model import GPTConfig, GPTModel
# Create minimal config for testing
config = GPTConfig.small()
config.n_embd = 128 # Very small for testing
config.n_layer = 2 # Very small for testing
config.vocab_size = 1000 # Small vocabulary
config.block_size = 64 # Small context
# Create real model
model = GPTModel(config)
model.eval()
# Create minimal tokenizer
class MinimalTokenizer:
def __init__(self):
self._vocab_size = 1000  # Private name so the vocab_size() method below is not shadowed
def encode(self, text):
# Simple character-based encoding for testing
return [ord(c) % 1000 for c in text[:50]] # Limit to 50 chars
def decode(self, tokens):
# Simple character-based decoding for testing
return "".join([chr(t % 256) for t in tokens if t < 256])
def vocab_size(self):
return self._vocab_size
# Create real inference engine with lightweight model
class LightweightInferenceEngine:
def __init__(self):
self.model = model
self.tokenizer = MinimalTokenizer()
self.config = {
"model_name": "openllm-small-test",
"model_size": "small",
"n_embd": config.n_embd,
"n_layer": config.n_layer,
"vocab_size": config.vocab_size,
"block_size": config.block_size,
}
self.detected_format = "pytorch"
self.device = "cpu"
self.loaded_at = time.time()
self.total_requests = 0
def generate(self, prompt, max_length=10, temperature=0.7, **kwargs):
"""Real text generation with lightweight model."""
self.total_requests += 1
# Tokenize input
input_ids = self.tokenizer.encode(prompt)
if len(input_ids) == 0:
input_ids = [1] # Default token
# Simple autoregressive generation
generated = input_ids.copy()
for _ in range(max_length):
if len(generated) >= self.config["block_size"]:
break
# Create input tensor
input_tensor = torch.tensor([generated], dtype=torch.long)
# Forward pass
with torch.no_grad():
logits, _ = self.model(input_tensor)
# Get next token
next_token_logits = logits[0, -1, :] / temperature
probs = torch.softmax(next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).item()
generated.append(next_token)
# Decode generated text
generated_text = self.tokenizer.decode(generated[len(input_ids) :])
return [generated_text]
def get_info(self):
"""Get real model information."""
return {
"model_name": "openllm-small-test",
"model_size": "small",
"parameters": config.n_embd * config.n_layer * 1000,
"vocab_size": config.vocab_size,
"max_length": config.block_size,
"format": "pytorch",
"loaded_at": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.loaded_at)),
}
def get_health(self):
"""Get real health status."""
return {
"status": "healthy",
"model_loaded": True,
"uptime_seconds": time.time() - self.loaded_at,
"total_requests": self.total_requests,
}
return LightweightInferenceEngine()
except Exception as e:
print(f"⚠️ Failed to create lightweight model: {e}")
# Fallback to simple mock if real model creation fails
class SimpleMockInferenceEngine:
def __init__(self):
self.model = "simple_mock"
self.tokenizer = "simple_mock"
self.config = {"model_name": "fallback-model"}
self.detected_format = "pytorch"
self.device = "cpu"
self.loaded_at = time.time()
self.total_requests = 0
def generate(self, prompt, **kwargs):
self.total_requests += 1
return [f"Generated: {prompt[:10]}..."]
def get_info(self):
return {
"model_name": "fallback-model",
"model_size": "small",
"parameters": 1000,
"vocab_size": 1000,
"max_length": 100,
"format": "pytorch",
"loaded_at": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.loaded_at)),
}
def get_health(self):
return {
"status": "healthy",
"model_loaded": True,
"uptime_seconds": time.time() - self.loaded_at,
"total_requests": self.total_requests,
}
return SimpleMockInferenceEngine()
if __name__ == "__main__":
main()