| import os |
| from typing import Optional |
| from cog import BasePredictor, Input, Path |
| from llama_cpp import Llama |
|
|
class Predictor(BasePredictor):
    """Cog predictor serving a Mistral-7B GGUF model with llama-cpp-python."""

    def setup(self) -> None:
        """Load the GGUF model into memory once, before any predictions."""
        model_path = "monad-mistral-7b.gguf"

        # Fail fast with a clear error. The original only printed a warning
        # and then let Llama() crash with an opaque native loader error.
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model not found at {model_path}")

        self.llm = Llama(
            model_path=model_path,
            n_ctx=4096,        # context window, in tokens
            n_threads=8,
            n_gpu_layers=-1,   # offload all layers to GPU when available
            verbose=False,
        )

        # Kept for backward compatibility: other code may read this attribute.
        self.default_params = {
            "max_tokens": 512,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        }

    def predict(
        self,
        prompt: str = Input(
            description="Input prompt",
            default="What is Monad blockchain?"
        ),
        system_prompt: str = Input(
            description="System prompt to guide the model's behavior",
            default="You are an expert on Monad blockchain technology. Provide accurate, helpful information about Monad's architecture, ecosystem, and capabilities."
        ),
        max_tokens: int = Input(
            description="Maximum number of tokens to generate",
            default=512,
            ge=1,
            le=4096
        ),
        temperature: float = Input(
            description="Temperature for sampling",
            default=0.7,
            ge=0.1,
            le=2.0
        ),
        top_p: float = Input(
            description="Top-p sampling parameter",
            default=0.9,
            ge=0.1,
            le=1.0
        ),
        top_k: int = Input(
            description="Top-k sampling parameter",
            default=40,
            ge=1,
            le=100
        ),
        repeat_penalty: float = Input(
            description="Penalty for repeated tokens",
            default=1.1,
            ge=1.0,
            le=2.0
        ),
        seed: int = Input(
            description="Random seed for reproducibility",
            default=-1
        )
    ) -> str:
        """Run inference on the model and return the generated text.

        The prompt is wrapped in the Mistral-Instruct chat template:
        a single ``[INST] ... [/INST]`` turn, with the system prompt
        (when non-empty) prepended inside the same instruction block.
        """
        if system_prompt:
            formatted_prompt = f"[INST] {system_prompt}\n\n{prompt} [/INST]"
        else:
            formatted_prompt = f"[INST] {prompt} [/INST]"

        # BUG FIX: the original seeded Python's `random` and NumPy, which
        # llama.cpp's C-level sampler never consults — the output was not
        # reproducible. Pass the seed to the completion call itself instead;
        # a non-positive seed (default -1) keeps nondeterministic sampling.
        output = self.llm(
            formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            seed=seed if seed > 0 else None,
            stop=["</s>", "[INST]", "[/INST]"],
            echo=False,
        )

        return output['choices'][0]['text'].strip()