import os
import threading

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.generation.utils import DynamicCache

# Compatibility shim: some generation code paths still call get_max_length(), which
# newer transformers releases renamed to get_max_cache_shape(). Guarded so the module
# also imports cleanly on versions where the new method does not exist yet.
if not hasattr(DynamicCache, "get_max_length") and hasattr(DynamicCache, "get_max_cache_shape"):
    DynamicCache.get_max_length = DynamicCache.get_max_cache_shape

# Check if llama-cpp-python is available
def check_llamacpp_available():
    try:
        import llama_cpp
        return True
    except ImportError:
        return False

# Global cache for model and tokenizer
MODEL_CACHE = {}

def load_text_model(model_name, quantize=False):
    """
    Load text model with appropriate configuration for CPU or GPU

    Args:
        model_name (str): Hugging Face model ID
        quantize (bool): Whether to use 4-bit quantization (only works with GPU)

    Returns:
        tuple: (model, tokenizer)
    """
    # Check cache first
    cache_key = f"{model_name}_{quantize}"
    if cache_key in MODEL_CACHE:
        return MODEL_CACHE[cache_key]

    # Check CUDA availability
    cuda_available = torch.cuda.is_available()

    # Only try quantization if CUDA is available
    if quantize and cuda_available:
        try:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True
            )
        except Exception as e:
            print(f"Quantization config creation failed: {e}")
            quantization_config = None
            quantize = False
    else:
        quantization_config = None
        quantize = False

    # Try loading the model
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Fix for attention mask warning
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Try with quantization first if requested and available
        if quantize and quantization_config:
            try:
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    quantization_config=quantization_config,
                    device_map="auto",
                    trust_remote_code=True
                )
            except Exception as e:
                print(f"Failed to load with quantization: {e}")
                quantize = False

        # If quantization is not used or failed, try standard loading
        if not quantize:
            # For CPU, just load without specifying dtype
            if not cuda_available:
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map="auto",
                    trust_remote_code=True
                )
            else:
                # Try different dtypes for GPU
                for dtype in (torch.float16, torch.float32):
                    try:
                        model = AutoModelForCausalLM.from_pretrained(
                            model_name,
                            torch_dtype=dtype,
                            device_map="auto",
                            trust_remote_code=True
                        )
                        break
                    except Exception:
                        if dtype == torch.float32:
                            # Last resort: try without specifying dtype
                            model = AutoModelForCausalLM.from_pretrained(
                                model_name,
                                device_map="auto",
                                trust_remote_code=True
                            )

        # Cache the loaded model and tokenizer
        MODEL_CACHE[cache_key] = (model, tokenizer)
        return model, tokenizer
    except Exception as e:
        raise RuntimeError(f"Failed to load model {model_name}: {e}")

def format_prompt(tokenizer, query):
    """
    Format prompt according to the model's requirements

    Args:
        tokenizer: The model tokenizer
        query (str): User query

    Returns:
        str: Formatted prompt
    """
    enhanced_query = f"Please answer this question about pharmaceuticals or medical topics.\n\nQuestion: {query}"

    # Use the chat template if the tokenizer provides one
    if hasattr(tokenizer, "apply_chat_template") and callable(getattr(tokenizer, "apply_chat_template")):
        messages = [{"role": "user", "content": enhanced_query}]
        try:
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except Exception:
            # Fall back to plain formatting if the chat template fails
            pass

    # Simple formatting fallback
    return f"User: {enhanced_query}\nAssistant:"

def generate_text_with_transformers(model, tokenizer, query, max_tokens=512, temperature=0.7,
                                    top_p=0.9, repetition_penalty=1.1, cancel_event=None,
                                    progress_callback=None):
    """
    Generate text with a transformers model, streaming tokens when possible

    Args:
        model: The language model
        tokenizer: The tokenizer
        query (str): User query
        max_tokens (int): Maximum tokens to generate
        temperature (float): Temperature for sampling
        top_p (float): Top-p sampling parameter
        repetition_penalty (float): Penalty for repetition
        cancel_event (threading.Event): Event to signal cancellation
        progress_callback (callable): Function to report progress

    Returns:
        str: Generated response
    """
    # Format the prompt
    prompt = format_prompt(tokenizer, query)

    # Prepare inputs
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Update progress
    if progress_callback:
        progress_callback(0.2, "Starting generation...")

    try:
        from transformers import TextIteratorStreamer

        # Set up streamer for token-by-token generation
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        # Prepare generation parameters
        generation_kwargs = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,  # Explicitly provide attention mask
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "do_sample": temperature > 0.1,
            "streamer": streamer
        }

        # Start generation in a separate thread
        generation_thread = threading.Thread(
            target=model.generate,
            kwargs=generation_kwargs
        )
        generation_thread.start()

        # Collect tokens as they're generated
        response_text = ""
        for i, new_text in enumerate(streamer):
            if cancel_event and cancel_event.is_set():
                break
            response_text += new_text

            # Update progress periodically
            if progress_callback and i % 5 == 0:
                progress_callback(0.3 + min(0.6, len(response_text) / 500), "Generating response...")

        return response_text
    except Exception as e:
        print(f"Streaming generation failed, falling back to standard generation: {e}")

        # Fallback to standard (non-streaming) generation
        try:
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                do_sample=temperature > 0.1,
            )

            # Decode the output, dropping the prompt tokens
            prompt_length = inputs.input_ids.shape[1]
            response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
            return response
        except Exception as e2:
            return f"Error in text generation: {e2}"

# Global llama.cpp model cache
LLAMA_MODEL = None

def load_llamacpp_model(model_path=None):
    """Load the llama.cpp model, downloading from HF Hub if needed."""
    global LLAMA_MODEL

    # Imported lazily so this module can still be imported when llama-cpp-python
    # is not installed (see check_llamacpp_available).
    from llama_cpp import Llama
    from huggingface_hub import hf_hub_download

    # Return cached model if available
    if LLAMA_MODEL is not None:
        return LLAMA_MODEL

    # 1) Look for an existing file on disk
    if model_path is None:
        possible_paths = [
            "models/Phi-3-mini-4k-instruct.Q4_K_M.gguf",
            os.path.join(os.path.dirname(os.path.dirname(__file__)), "models/Phi-3-mini-4k-instruct.Q4_K_M.gguf"),
            "/models/Phi-3-mini-4k-instruct.Q4_K_M.gguf",
            os.path.expanduser("~/.cache/huggingface/hub/models/Phi-3-mini-4k-instruct.Q4_K_M.gguf"),
        ]
        for p in possible_paths:
            if os.path.exists(p):
                model_path = p
                break

    # 2) If still not found, download into models/
    if model_path is None:
        print("→ GGUF not found locally, downloading from HF Hub…")
        model_path = hf_hub_download(
            repo_id="MohammedSameerSyed/phi3-gguf",  # HF repo hosting the .gguf
            filename="Phi-3-mini-4k-instruct.Q4_K_M.gguf",
            cache_dir="models",  # will create models/ if needed
        )

    # 3) Finally load with llama.cpp
    try:
        LLAMA_MODEL = Llama(
            model_path=model_path,
            n_ctx=4096,  # full 4K context
            n_batch=512,
            n_threads=4,
            n_gpu_layers=0
        )
        return LLAMA_MODEL
    except Exception as e:
        raise RuntimeError(f"Failed to load llama.cpp model: {e}")

def generate_text_with_llamacpp(query, max_tokens=512, temperature=0.7, top_p=0.9,
                                stop=None, cancel_event=None, progress_callback=None, model_path=None):
    """
    Generate text using llama.cpp

    Args:
        query (str): User query
        max_tokens (int): Maximum tokens to generate
        temperature (float): Temperature for sampling
        top_p (float): Top-p sampling parameter
        stop (list): List of stop sequences
        cancel_event (threading.Event): Event to signal cancellation
        progress_callback (callable): Function to report progress
        model_path (str): Path to GGUF model file (optional)

    Returns:
        str: Generated response
    """
    if progress_callback:
        progress_callback(0.1, "Loading llama.cpp model...")

    # Load model
    try:
        model = load_llamacpp_model(model_path)
    except Exception as e:
        raise RuntimeError(f"Failed to load llama.cpp model: {e}")

    if progress_callback:
        progress_callback(0.3, "Starting generation...")

    # Format prompt
    prompt = f"You are a helpful pharmaceutical assistant. Please answer this question about medications or medical topics.\n\nQuestion: {query}\n\nAnswer:"

    # Define stop sequences if not provided
    if stop is None:
        stop = ["Question:", "\n\n"]

    try:
        # Check if the create_completion method exists (newer versions)
        if hasattr(model, "create_completion"):
            # Stream the response chunk by chunk
            response_text = ""

            # Generate completion with streaming
            stream = model.create_completion(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=40,
                stop=stop,
                stream=True
            )

            # Process stream
            for i, chunk in enumerate(stream):
                if cancel_event and cancel_event.is_set():
                    break
                text_chunk = chunk["choices"][0]["text"]
                response_text += text_chunk

                # Update progress periodically
                if progress_callback and i % 5 == 0:
                    progress_callback(0.4 + min(0.5, len(response_text) / 500), "Generating response...")

            return response_text.strip()
        else:
            # Fallback to the older __call__ interface
            result = model(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=40,
                stop=stop,
                echo=False
            )

            if progress_callback:
                progress_callback(0.9, "Finalizing...")

            return result["choices"][0]["text"].strip()
    except Exception as e:
        raise RuntimeError(f"Error in llama.cpp generation: {e}")