#!/usr/bin/env python3
"""
Optimized inference script for GGUF models
Supports llama-cpp-python for maximum speed
"""
import argparse
import time
from pathlib import Path
import multiprocessing
try:
    from llama_cpp import Llama
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False
    print("llama-cpp-python not available.")
    print("Install with: pip install llama-cpp-python")
class FastInference:
"""Optimized inference class for GGUF models"""
def __init__(self, model_path: str, n_ctx: int = 4096, n_threads: int = -1):
self.model_path = model_path
if not LLAMA_CPP_AVAILABLE:
raise ImportError("llama-cpp-python required for GGUF inference")
# Use all CPU threads if not specified
if n_threads == -1:
n_threads = multiprocessing.cpu_count()
# Initialize model with optimized settings
self.model = Llama(
model_path=model_path,
n_ctx=n_ctx,
n_threads=n_threads,
n_batch=512, # Batch size for prompt processing
n_gpu_layers=-1 if self._has_gpu() else 0, # Use GPU if available
use_mmap=True, # Memory-mapped files
use_mlock=True, # Lock memory
verbose=False
)
print(f"Model loaded: {model_path}")
print(f"Context length: {n_ctx}")
print(f"Threads: {n_threads}")
print(f"GPU layers: {-1 if self._has_gpu() else 0}")
def _has_gpu(self) -> bool:
"""Check if GPU is available"""
try:
import torch
return torch.cuda.is_available()
except ImportError:
return False
    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
        """Generate text with optimized settings"""
        start_time = time.time()

        # Optimized generation parameters
        response = self.model(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
            repeat_penalty=1.1,
            stop=["</code>", "\n\n\n"],  # Stop sequences
            stream=False
        )

        generation_time = time.time() - start_time
        generated_text = response['choices'][0]['text']

        # Estimate tokens per second (whitespace split is a rough proxy for token count)
        estimated_tokens = len(generated_text.split())
        tokens_per_sec = estimated_tokens / generation_time if generation_time > 0 else 0

        print("\nPerformance:")
        print(f"  Time: {generation_time:.2f}s")
        print(f"  Speed: {tokens_per_sec:.1f} tokens/sec")
        print(f"  Tokens: {estimated_tokens}")

        return generated_text
    def generate_stream(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7):
        """Generate text with streaming output"""
        print("\nStreaming response:")
        start_time = time.time()
        total_tokens = 0

        stream = self.model(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
            repeat_penalty=1.1,
            stop=["</code>", "\n\n\n"],
            stream=True
        )

        for chunk in stream:
            text = chunk['choices'][0]['text']
            print(text, end='', flush=True)
            total_tokens += 1  # Each streamed chunk carries roughly one token

        generation_time = time.time() - start_time
        tokens_per_sec = total_tokens / generation_time if generation_time > 0 else 0

        print("\n\nStreaming Performance:")
        print(f"  Time: {generation_time:.2f}s")
        print(f"  Speed: {tokens_per_sec:.1f} tokens/sec")
    def chat_mode(self):
        """Interactive chat mode"""
        print("\nInteractive Chat Mode")
        print("Commands: 'exit' to quit, 'stream' to toggle streaming")
        print("-" * 50)

        use_streaming = False

        while True:
            try:
                prompt = input("\nYou: ")

                if prompt.lower() == 'exit':
                    print("Goodbye!")
                    break
                elif prompt.lower() == 'stream':
                    use_streaming = not use_streaming
                    print(f"Streaming {'enabled' if use_streaming else 'disabled'}")
                    continue

                print("Assistant:", end=" ")
                if use_streaming:
                    self.generate_stream(prompt)
                else:
                    response = self.generate(prompt)
                    print(response)
            except KeyboardInterrupt:
                print("\n\nGoodbye!")
                break
def main():
    parser = argparse.ArgumentParser(description="Fast GGUF Model Inference")
    parser.add_argument("--model", required=True, help="Path to GGUF model file")
    parser.add_argument("--prompt", help="Text prompt for generation")
    parser.add_argument("--max-tokens", type=int, default=512, help="Maximum tokens to generate")
    parser.add_argument("--temperature", type=float, default=0.7, help="Generation temperature")
    parser.add_argument("--ctx-size", type=int, default=4096, help="Context size")
    parser.add_argument("--threads", type=int, default=-1, help="Number of threads (-1 for auto)")
    parser.add_argument("--interactive", action="store_true", help="Start interactive chat mode")
    parser.add_argument("--stream", action="store_true", help="Use streaming generation")
    args = parser.parse_args()

    # Initialize inference
    print(f"Loading model: {args.model}")
    inferencer = FastInference(
        args.model,
        n_ctx=args.ctx_size,
        n_threads=args.threads
    )
    if args.interactive:
        inferencer.chat_mode()
    elif args.prompt:
        if args.stream:
            inferencer.generate_stream(args.prompt, args.max_tokens, args.temperature)
        else:
            response = inferencer.generate(args.prompt, args.max_tokens, args.temperature)
            print("\nGenerated text:")
            print(response)
    else:
        print("Please provide --prompt or use --interactive mode")
        print("Examples:")
        print("  python fast_inference.py --model model.gguf --prompt 'def hello():'")
        print("  python fast_inference.py --model model.gguf --interactive")
if __name__ == "__main__":
    main()
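
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; model path and parameter values below are
# placeholders, everything else uses the flags and class defined in this file):
#
#   One-off generation from the command line:
#     python fast_inference.py --model model.gguf --prompt "def fibonacci(n):" --max-tokens 256
#
#   Interactive chat (type 'stream' inside the session to toggle streaming):
#     python fast_inference.py --model model.gguf --interactive
#
#   Programmatic use from another script:
#     from fast_inference import FastInference
#     llm = FastInference("model.gguf", n_ctx=2048)
#     print(llm.generate("Write a docstring for a binary search function.", max_tokens=128))
# ---------------------------------------------------------------------------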