Spaces:

FrameRateTech
/

DS-llama-8b-instruct

Paused

App Files Files Community

FrameRateTech commited on Mar 13

Commit

e884311

verified ·

1 Parent(s): 5e75927

Update app.py

Browse files

Files changed (1) hide show

app.py +144 -232

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py
 import os
 import gc
@@ -9,11 +9,9 @@ import transformers
 import torch
 import gradio as gr
 from transformers import (
-    AutoTokenizer,
     AutoModelForCausalLM,
-    GenerationConfig,
-    BitsAndBytesConfig,
-    LlamaTokenizer  # Added direct import for LlamaTokenizer
 )
 ###############################################################################
@@ -36,26 +34,16 @@ DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful, and honest assistant.
 If a question is not clear or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
 ###############################################################################
-# Device Configuration and Memory Management
 ###############################################################################
-def get_device_info():
-    """Log information about available devices and memory"""
-    device_info = {
-        "cuda_available": torch.cuda.is_available(),
-        "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
-        "mps_available": hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
-    }
-    if device_info["cuda_available"] and device_info["device_count"] > 0:
-        device_info["cuda_device_name"] = torch.cuda.get_device_name(0)
-        device_info["cuda_device_mem_total"] = torch.cuda.get_device_properties(0).total_memory / (1024**3)
-        device_info["cuda_device_mem_reserved"] = torch.cuda.memory_reserved(0) / (1024**3)
-        device_info["cuda_device_mem_allocated"] = torch.cuda.memory_allocated(0) / (1024**3)
-    logger.info(f"Device information: {device_info}")
-    return device_info
 def optimize_memory():
     """Optimize memory usage by clearing caches and forcing garbage collection"""
     if torch.cuda.is_available():
@@ -64,106 +52,78 @@ def optimize_memory():
     logger.info("Memory optimized: caches cleared and garbage collected")
 ###############################################################################
-# Model Loading with Error Handling
 ###############################################################################
-def load_model_and_tokenizer():
-    """Load the model and tokenizer with comprehensive error handling and logging"""
-    logger.info(f"Loading model: {MODEL_ID}")
-    logger.info(f"Transformers version: {transformers.__version__}")
-    logger.info(f"PyTorch version: {torch.__version__}")
-    device_info = get_device_info()
-    # Determine quantization settings based on available hardware
-    load_in_4bit = False
-    load_in_8bit = False
-    if device_info["cuda_available"]:
-        # On ZEROGPU environments, 4-bit quantization helps fit the model in memory
-        load_in_4bit = True
-        logger.info("Using 4-bit quantization for CUDA device")
-    # Configure quantization if needed
-    if load_in_4bit:
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True
-        )
-        logger.info("Configured 4-bit quantization with NF4 type")
-    elif load_in_8bit:
-        quantization_config = BitsAndBytesConfig(
-            load_in_8bit=True
-        )
-        logger.info("Configured 8-bit quantization")
-    else:
-        quantization_config = None
-        logger.info("No quantization configured, using default precision")
-    # Step 1: Load tokenizer with direct class instantiation for Llama models
-    try:
-        logger.info("Loading tokenizer...")
-        tokenizer_start = time.time()
-        # First, try loading directly as a LlamaTokenizer instead of using AutoTokenizer
-        try:
-            logger.info("Attempting to load as LlamaTokenizer...")
-            tokenizer = LlamaTokenizer.from_pretrained(
-                MODEL_ID,
-                use_fast=False,
-                trust_remote_code=True
-            )
-            logger.info("Successfully loaded tokenizer as LlamaTokenizer")
-        except Exception as e:
-            logger.warning(f"Failed to load as LlamaTokenizer: {str(e)}")
-            logger.info("Falling back to AutoTokenizer...")
-            # Try with AutoTokenizer but with strict error checking
-            tokenizer = AutoTokenizer.from_pretrained(
-                MODEL_ID,
-                use_fast=False,
-                trust_remote_code=True
-            )
-        # Check if tokenizer is a valid object
-        if tokenizer is None or isinstance(tokenizer, bool):
-            logger.error(f"Tokenizer loaded as {type(tokenizer).__name__} (value: {tokenizer})")
-            logger.info("Attempting to create a basic LlamaTokenizer...")
-            # Last resort: Create a basic LlamaTokenizer with default config
-            tokenizer = LlamaTokenizer.from_pretrained(
-                "meta-llama/Llama-3.1-8B-Instruct",  # Use base model as fallback
-                use_fast=False
-            )
-            logger.info("Created fallback tokenizer from base model")
-        tokenizer_load_time = time.time() - tokenizer_start
-        logger.info(f"Tokenizer loaded successfully in {tokenizer_load_time:.2f} seconds")
-        logger.info(f"Tokenizer type: {type(tokenizer).__name__}")
-        # Set pad token if needed
-        if getattr(tokenizer, "pad_token_id", None) is None:
-            logger.info("Pad token not found, setting pad_token_id to eos_token_id")
-            tokenizer.pad_token_id = getattr(tokenizer, "eos_token_id", None)
-        # Log important tokenizer properties if possible
-        try:
-            tokenizer_info = {
-                "vocab_size": len(tokenizer.get_vocab()) if hasattr(tokenizer, "get_vocab") else "unknown",
-                "model_max_length": tokenizer.model_max_length if hasattr(tokenizer, "model_max_length") else "unknown",
-                "bos_token": tokenizer.bos_token if hasattr(tokenizer, "bos_token") else "unknown",
-                "eos_token": tokenizer.eos_token if hasattr(tokenizer, "eos_token") else "unknown",
-                "has_chat_template": hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None
             }
-            logger.info(f"Tokenizer properties: {tokenizer_info}")
-        except Exception as e:
-            logger.warning(f"Could not log all tokenizer properties: {str(e)}")
-    except Exception as e:
-        logger.error(f"Failed to load tokenizer: {str(e)}")
-        logger.error(traceback.format_exc())
-        raise RuntimeError(f"Failed to load tokenizer: {str(e)}")
     # Step 2: Load model with detailed error logging
     try:
@@ -184,25 +144,28 @@ def load_model_and_tokenizer():
             torch_dtype = torch.float32
             logger.info("Using CPU with float32 precision")
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch_dtype,
             device_map=device_map,
             trust_remote_code=True,
-            quantization_config=quantization_config
         )
         model.eval()
         model_load_time = time.time() - model_start
         logger.info(f"Model loaded successfully in {model_load_time:.2f} seconds")
         # Log model info
-        model_info = {
-            "model_type": model.config.model_type,
-            "hidden_size": model.config.hidden_size,
-            "vocab_size": model.config.vocab_size,
-            "num_hidden_layers": model.config.num_hidden_layers
-        }
-        logger.info(f"Model properties: {model_info}")
     except Exception as e:
         logger.error(f"Failed to load model: {str(e)}")
@@ -214,66 +177,33 @@ def load_model_and_tokenizer():
 ###############################################################################
 # Chat Formatting and Generation Functions
 ###############################################################################
-def format_chat_for_model(messages, tokenizer, system_prompt=DEFAULT_SYSTEM_PROMPT):
-    """
-    Format chat messages for the model using the tokenizer's chat template if available,
-    or fall back to a manual format for Llama models.
-    """
     logger.info(f"Formatting chat with {len(messages)} messages")
-    # Prepare messages in the correct format
-    formatted_messages = []
     # Add system message if not already present
-    if messages and messages[0].get("role") != "system":
-        formatted_messages.append({"role": "system", "content": system_prompt})
-    # Add user and assistant messages
     for msg in messages:
-        role = msg["role"]
-        # Skip system messages if we already added one
-        if role == "system" and formatted_messages and formatted_messages[0]["role"] == "system":
-            continue
-        formatted_messages.append({"role": role, "content": msg["content"]})
-    # Try different approaches to format the chat
-    # Approach 1: Use the tokenizer's built-in chat template if available
-    if hasattr(tokenizer, "apply_chat_template") and callable(getattr(tokenizer, "apply_chat_template")):
-        logger.info("Using tokenizer's built-in chat template")
-        try:
-            chat_text = tokenizer.apply_chat_template(
-                formatted_messages,
-                tokenize=False,
-                add_generation_prompt=True
-            )
-            logger.debug(f"Formatted chat using built-in template: {chat_text[:100]}...")
-            return chat_text
-        except Exception as e:
-            logger.warning(f"Failed to apply chat template: {str(e)}")
-            logger.warning("Falling back to manual formatting")
-    # Approach 2: Use a Llama 3.1 specific prompt format based on the config files we've seen
-    # This is based on the special tokens in the model's configuration
-    logger.info("Using manual chat formatting for Llama model")
-    chat_text = "<|begin_of_text|>"
-    for msg in formatted_messages:
         role = msg["role"]
         content = msg["content"]
         if role == "system":
-            chat_text += f"<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>"
         elif role == "user":
-            chat_text += f"<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>"
         elif role == "assistant":
-            chat_text += f"<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>"
-    # Add the final assistant header for generation
-    chat_text += "<|start_header_id|>assistant<|end_header_id|>\n\n"
-    logger.debug(f"Manually formatted chat: {chat_text[:100]}...")
     return chat_text
 def generate_response(model, tokenizer, messages, temperature=0.7, top_p=0.9, max_new_tokens=256, system_prompt=DEFAULT_SYSTEM_PROMPT):
@@ -281,7 +211,7 @@ def generate_response(model, tokenizer, messages, temperature=0.7, top_p=0.9, ma
     logger.info(f"Generating response with temp={temperature}, top_p={top_p}, max_tokens={max_new_tokens}")
     # Format the messages for the model
-    prompt = format_chat_for_model(messages, tokenizer, system_prompt)
     # Configure generation parameters
     gen_config = GenerationConfig(
@@ -290,71 +220,75 @@ def generate_response(model, tokenizer, messages, temperature=0.7, top_p=0.9, ma
         do_sample=True,
         repetition_penalty=1.1,
         max_new_tokens=max_new_tokens,
     )
-    # Tokenize the input
-    try:
-        inputs = tokenizer(prompt, return_tensors="pt")
-        inputs = {k: v.to(model.device) for k, v in inputs.items()}
-        logger.info(f"Input tokenized to {inputs['input_ids'].shape[1]} tokens")
-    except Exception as e:
-        logger.error(f"Error during tokenization: {str(e)}")
-        return "I encountered an error while processing your message. Please try again."
     # Generate with retry logic
     max_retries = 3
     retry_count = 0
     while retry_count < max_retries:
         try:
             # Run the generation
             generation_start = time.time()
             with torch.no_grad():
-                output_ids = model.generate(
                     **inputs,
                     generation_config=gen_config,
                 )
             generation_time = time.time() - generation_start
             logger.info(f"Generation completed in {generation_time:.2f} seconds")
-            # Decode the output
-            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-            # Extract just the assistant's response
-            assistant_response = ""
-            # Try different extraction methods based on the model format
-            # Method 1: Standard extraction for template-based output
-            if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
-                parts = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")
-                if len(parts) > 1:
-                    assistant_part = parts[-1]
-                    if "<|eot_id|>" in assistant_part:
-                        assistant_response = assistant_part.split("<|eot_id|>")[0].strip()
-                    else:
-                        assistant_response = assistant_part.strip()
-            # Method 2: Simple extraction based on prompt length
-            else:
-                # This is a fallback - not as accurate but should work in most cases
-                assistant_response = generated_text[len(prompt):].strip()
-                # If the assistant response seems to have formatting tokens, clean them up
-                for token in ["<|eot_id|>", "<|eom_id|>", "<|end_of_text|>"]:
-                    if token in assistant_response:
-                        assistant_response = assistant_response.split(token)[0].strip()
-            logger.info(f"Response extracted, length: {len(assistant_response)} chars")
-            # If we got an empty response, return a fallback message
-            if not assistant_response.strip():
-                logger.warning("Empty response detected, using fallback message")
-                assistant_response = "I'm sorry, I couldn't generate a proper response. Please try again with a different question or adjust the generation parameters."
             # Free up memory
-            del inputs, output_ids
             optimize_memory()
             return assistant_response
         except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
@@ -523,23 +457,6 @@ def build_gradio_interface(model, tokenizer):
     return demo
-###############################################################################
-# Simple messaging for testing tokenizer
-###############################################################################
-def test_tokenize_function(tokenizer):
-    """Test function to ensure tokenizer works with a simple input"""
-    try:
-        logger.info("Testing tokenizer with a simple input")
-        test_input = "Hello, how are you today?"
-        encoded = tokenizer(test_input, return_tensors="pt")
-        logger.info(f"Tokenizer test successful: encoded to {encoded['input_ids'].shape[1]} tokens")
-        decoded = tokenizer.decode(encoded["input_ids"][0])
-        logger.info(f"Decoded test: '{decoded}'")
-        return True
-    except Exception as e:
-        logger.error(f"Tokenizer test failed: {str(e)}")
-        return False
 ###############################################################################
 # Main Application Logic
 ###############################################################################
@@ -552,11 +469,6 @@ def main():
         # Load model and tokenizer
         model, tokenizer = load_model_and_tokenizer()
-        # Test tokenizer functionality
-        test_result = test_tokenize_function(tokenizer)
-        if not test_result:
-            logger.warning("Tokenizer test failed, but continuing with caution")
         # Build and launch Gradio interface
         demo = build_gradio_interface(model, tokenizer)

+# app.py - Minimal Version
 import os
 import gc
 import torch
 import gradio as gr
 from transformers import (
+    PreTrainedTokenizerFast,
     AutoModelForCausalLM,
+    GenerationConfig
 )
 ###############################################################################
 If a question is not clear or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+# The special tokens we observed in the model's configuration
+BOS_TOKEN = "<|begin_of_text|>"
+EOS_TOKEN = "<|eot_id|>"
+SYSTEM_START = "<|start_header_id|>system<|end_header_id|>\n\n"
+USER_START = "<|start_header_id|>user<|end_header_id|>\n\n"
+ASSISTANT_START = "<|start_header_id|>assistant<|end_header_id|>\n\n"
 ###############################################################################
+# Memory Management
 ###############################################################################
 def optimize_memory():
     """Optimize memory usage by clearing caches and forcing garbage collection"""
     if torch.cuda.is_available():
     logger.info("Memory optimized: caches cleared and garbage collected")
 ###############################################################################
+# Custom Tokenizer Class
 ###############################################################################
+class MinimalTokenizer:
+    """A minimal tokenizer implementation that works with basic model I/O"""
+    def __init__(self):
+        logger.info("Initializing MinimalTokenizer")
+        # Use a basic set of special tokens based on the model config
+        self.bos_token = BOS_TOKEN
+        self.eos_token = EOS_TOKEN
+        self.pad_token = EOS_TOKEN
+        # Map tokens to ids (using values from the model config)
+        self.token_to_id = {
+            BOS_TOKEN: 128000,  # Based on config.json
+            EOS_TOKEN: 128009,  # Based on config.json
+        }
+        # For logging
+        logger.info(f"MinimalTokenizer initialized with special tokens: {self.token_to_id}")
+    def __call__(self, text, return_tensors=None):
+        """Tokenize text using the model directly"""
+        logger.info(f"Tokenizing text (length: {len(text)})")
+        # Create inputs for the model - we'll let the model tokenize internally
+        inputs = {
+            "text": text,
+        }
+        # If return_tensors is specified, create a dummy tensor
+        # The model will handle tokenization internally
+        if return_tensors == "pt":
+            # Create a dummy input_ids tensor with the BOS token
+            # The actual tokenization will happen inside the model
+            dummy_input_ids = torch.tensor([[self.token_to_id[self.bos_token]]])
+            inputs = {
+                "input_ids": dummy_input_ids,
+                "_text": text,  # Store the text for the model to use
             }
+        return inputs
+    def decode(self, token_ids, skip_special_tokens=True):
+        """Dummy decode function - the model will handle decoding"""
+        # This is just a placeholder - the model will decode internally
+        # For logging purposes
+        logger.info(f"Decoding token_ids (shape: {token_ids.shape if hasattr(token_ids, 'shape') else 'N/A'})")
+        # We'll get the raw output from the model and handle it specially
+        # in the generation function
+        return ""
+###############################################################################
+# Model Loading with Error Handling
+###############################################################################
+def load_model_and_tokenizer():
+    """Load the model with comprehensive error handling and logging"""
+    logger.info(f"Loading model: {MODEL_ID}")
+    logger.info(f"Transformers version: {transformers.__version__}")
+    logger.info(f"PyTorch version: {torch.__version__}")
+    # Check available devices
+    device_info = {
+        "cuda_available": torch.cuda.is_available(),
+        "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
+        "mps_available": hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+    }
+    logger.info(f"Device information: {device_info}")
+    # Create minimal tokenizer
+    tokenizer = MinimalTokenizer()
     # Step 2: Load model with detailed error logging
     try:
             torch_dtype = torch.float32
             logger.info("Using CPU with float32 precision")
+        # Load the model
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch_dtype,
             device_map=device_map,
             trust_remote_code=True,
         )
         model.eval()
         model_load_time = time.time() - model_start
         logger.info(f"Model loaded successfully in {model_load_time:.2f} seconds")
         # Log model info
+        try:
+            model_info = {
+                "model_type": model.config.model_type,
+                "hidden_size": model.config.hidden_size,
+                "vocab_size": model.config.vocab_size,
+                "num_hidden_layers": model.config.num_hidden_layers
+            }
+            logger.info(f"Model properties: {model_info}")
+        except Exception as e:
+            logger.warning(f"Could not log all model properties: {str(e)}")
     except Exception as e:
         logger.error(f"Failed to load model: {str(e)}")
 ###############################################################################
 # Chat Formatting and Generation Functions
 ###############################################################################
+def format_chat_for_model(messages, system_prompt=DEFAULT_SYSTEM_PROMPT):
+    """Format chat messages using the special tokens from model configuration"""
     logger.info(f"Formatting chat with {len(messages)} messages")
+    # Start with BOS token
+    chat_text = BOS_TOKEN
     # Add system message if not already present
+    if not messages or messages[0].get("role") != "system":
+        chat_text += SYSTEM_START + system_prompt + EOS_TOKEN
+    # Add all messages in the correct format
     for msg in messages:
         role = msg["role"]
         content = msg["content"]
         if role == "system":
+            chat_text += SYSTEM_START + content + EOS_TOKEN
         elif role == "user":
+            chat_text += USER_START + content + EOS_TOKEN
         elif role == "assistant":
+            chat_text += ASSISTANT_START + content + EOS_TOKEN
+    # Add final assistant header for the model to continue
+    chat_text += ASSISTANT_START
+    logger.info(f"Formatted chat text (length: {len(chat_text)})")
     return chat_text
 def generate_response(model, tokenizer, messages, temperature=0.7, top_p=0.9, max_new_tokens=256, system_prompt=DEFAULT_SYSTEM_PROMPT):
     logger.info(f"Generating response with temp={temperature}, top_p={top_p}, max_tokens={max_new_tokens}")
     # Format the messages for the model
+    prompt = format_chat_for_model(messages, system_prompt)
     # Configure generation parameters
     gen_config = GenerationConfig(
         do_sample=True,
         repetition_penalty=1.1,
         max_new_tokens=max_new_tokens,
+        pad_token_id=tokenizer.token_to_id[tokenizer.pad_token],
+        bos_token_id=tokenizer.token_to_id[tokenizer.bos_token],
+        eos_token_id=tokenizer.token_to_id[tokenizer.eos_token],
     )
     # Generate with retry logic
     max_retries = 3
     retry_count = 0
     while retry_count < max_retries:
         try:
+            # Tokenize with dummy tensors - the model will handle the actual text
+            inputs = tokenizer(prompt, return_tensors="pt")
+            inputs["text"] = prompt  # Store the actual text
+            inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
             # Run the generation
             generation_start = time.time()
             with torch.no_grad():
+                outputs = model.generate(
                     **inputs,
                     generation_config=gen_config,
                 )
             generation_time = time.time() - generation_start
             logger.info(f"Generation completed in {generation_time:.2f} seconds")
+            # Extract just the assistant's response using string operations
+            # This is the key part - the model's output is processed as a string, not tokens
+            # Split on the last occurrence of our custom beginning of assistant text
+            # We trust the model to format the output correctly
+            full_text = prompt  # Start with our prompt
+            # Extract actual new text from model's output
+            # The output might be unpredictable, so we need to be careful here
+            try:
+                # Try to get string representation of the output
+                output_text = "".join([chr(id) for id in outputs[0].tolist()])
+                # Remove initial prompt text to get just the model's generation
+                # Add this to the full text
+                full_text += output_text
+            except Exception as e:
+                logger.warning(f"Could not process model output as expected: {str(e)}")
+                # In case of failure, produce a simple response
+                full_text += "I apologize, but I'm having trouble generating a response."
+            # Extract just the final assistant's response
+            try:
+                parts = full_text.split(ASSISTANT_START)
+                assistant_part = parts[-1]  # Get the last assistant part
+                # Remove any trailing EOS token
+                if EOS_TOKEN in assistant_part:
+                    assistant_response = assistant_part.split(EOS_TOKEN)[0].strip()
+                else:
+                    assistant_response = assistant_part.strip()
+            except Exception as e:
+                logger.warning(f"Error extracting assistant response: {str(e)}")
+                assistant_response = "I apologize, but I'm having trouble generating a proper response."
+            logger.info(f"Extracted assistant response (length: {len(assistant_response)})")
             # Free up memory
+            del inputs, outputs
             optimize_memory()
+            # Fallback if we get an empty response
+            if not assistant_response:
+                assistant_response = "I apologize, but I couldn't generate a response. Please try again."
             return assistant_response
         except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
     return demo
 ###############################################################################
 # Main Application Logic
 ###############################################################################
         # Load model and tokenizer
         model, tokenizer = load_model_and_tokenizer()
         # Build and launch Gradio interface
         demo = build_gradio_interface(model, tokenizer)