from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import multiprocessing
import time
import tiktoken # For estimating token count
import logging # Import the logging module
# === Configure Logging ===
# Get a module-level logger
logger = logging.getLogger(__name__)
# Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
logger.setLevel(logging.INFO)
# Create a console handler and set its format
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# Add the handler to the logger if it's not already added
if not logger.handlers:
    logger.addHandler(handler)

app = FastAPI()
# === Model Config ===
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False
        )
        logger.info(f"✅ Model downloaded to: {model_path}")
    except Exception as e:
        logger.error(f"❌ Error downloading model: {e}")
        # Exit or handle the error appropriately if the model download fails
        exit(1)
else:
    logger.info(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Optimal thread usage ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = max(1, physical_cores or logical_cores or 1)  # psutil may return None; ensure at least 1 thread
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
logger.info(f"Using n_threads: {recommended_threads}")
# === Load the model ===
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,                    # Context window size (no chat history is kept, so a single request must fit here)
        n_threads=recommended_threads,
        use_mlock=True,                # Lock model weights in RAM for faster access
        n_gpu_layers=0,                # CPU only
        chat_format="chatml",          # Prompt in ChatML format (matches the <|im_end|> stop token used in /generate)
        verbose=False                  # Keep llama.cpp's internal verbose logging off
    )
    logger.info("✅ Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}")
    exit(1)

# Initialize tiktoken encoder for token counting
try:
    # Note: cl100k_base is an OpenAI tokenizer, so counts for this Mistral model are only rough estimates
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
    encoding = None

def count_tokens_in_text(text):
    """Estimates tokens in a given text using tiktoken or a simple character count."""
    if encoding:
        return len(encoding.encode(text))
    else:
        # Fallback for when tiktoken isn't available: rough estimate of ~4 characters per token
        return len(text) // 4

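# Note (illustrative): these counts are only used for logging and for the token metadata
# returned by /generate; they never affect generation itself. For example,
#   count_tokens_in_text("Analyse the attached sales figures")
# returns tiktoken's cl100k_base count when available, otherwise len(text) // 4.
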
@app.get("/")
def root():
    logger.info("Root endpoint accessed.")
    return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}

@app.get("/get_sys")
def get_sys_specs():
    """Returns system specifications including CPU, RAM, and OS details."""
    logger.info("System specs endpoint accessed.")
    memory = psutil.virtual_memory()
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": logical_cores,
            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1)  # CPU usage sampled over 1 second
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
"Model_Config": {
"model_name": FILENAME,
"n_ctx": llm.n_ctx(),
"n_threads": llm.n_threads(),
"use_mlock": llm.use_mlock()
}
}
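# Quick check (illustrative; assumes the app is served by uvicorn on the default port 8000):
#   curl http://localhost:8000/get_sys
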
@app.get("/process_list")
def process_list():
    """Returns a list of processes consuming significant CPU or memory."""
    logger.info("Process list endpoint accessed.")
    time.sleep(1)  # Let CPU counters settle for a more accurate measurement
    processes = []
    for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
        try:
            cpu = proc.cpu_percent()
            mem = proc.memory_percent()
            # Keep processes using more than 5% CPU or 2% memory
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    # Sort by CPU usage, descending
    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}

@app.post("/generate")
async def generate(request: Request):
    """
    Generates a response from the LLM without retaining chat context.
    Expects a JSON body with 'prompt'.
    """
    logger.info("➡️ /generate endpoint received a request.")
    data = await request.json()
    user_input = data.get("prompt", "").strip()
    if not user_input:
        logger.warning("Prompt cannot be empty in /generate request.")
        return JSONResponse(status_code=400, content={"error": "Prompt cannot be empty"})
    # Define the system prompt - sent with every request
    system_prompt_content = (
        "You are a highly efficient and objective data analysis API. You are the 'assistant'. "
        "Your sole function is to process the user's data and instructions, then output ONLY the requested analysis in the specified format. "
        "**Crucially, do NOT include any conversational text, greetings, introductions, conclusions, or any remarks about being an AI.** "
        "Respond directly with the content. Adhere strictly to all formatting requirements. "
        "If a request cannot be fulfilled, respond ONLY with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.'"
    )
    # Wrap the user input in a clear instruction to prevent role confusion:
    # this frames the user's text as 'data' for the model to analyze.
    user_content_template = f"""Please analyze the following data based on the instructions within it.
Provide only the direct output as requested. Do not add any extra conversational text.
--- DATA ---
{user_input}
"""
    # Construct messages for the current request only (no history is carried over)
    messages_for_llm = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": user_content_template}
    ]
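    # For illustration only: with chat_format="chatml", llama-cpp-python renders these
    # messages roughly as
    #   <|im_start|>system\n{system prompt}<|im_end|>
    #   <|im_start|>user\n{wrapped user input}<|im_end|>
    #   <|im_start|>assistant
    # which is why "<|im_end|>" is used as the stop token below.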
    # Calculate tokens in the user's prompt (for logging and response metadata only)
    prompt_tokens = count_tokens_in_text(user_input)
    logger.info(f"🧾 Original user input: {user_input}")
    logger.info(f"Tokens in prompt: {prompt_tokens}")
    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=800,
            temperature=0.2,      # Low temperature for more factual, less creative output
            stop=["<|im_end|>"]   # Stop token matching the ChatML chat format configured above
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        response_token_count = count_tokens_in_text(ai_response_content)
        logger.info("✅ Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"❌ Error during generation: {e}", exc_info=True)
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to generate response: {e}. Please try again."}
        )