Update app.py
app.py CHANGED
@@ -1,90 +1,250 @@
# app.py

import transformers
import torch
import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
-    GenerationConfig
)

[Removed lines from the previous version; only the following fragments were preserved in this view:]

-        trust_remote_code=True
-    )
-    print("tokenizer_test =", tokenizer_test)
-    print("type(tokenizer_test) =", type(tokenizer_test))
-except Exception as e:
-    print("AutoTokenizer failed with exception:", e)
-    raise e
-
-# If it's returning False, bail out early so we don't crash below
-if tokenizer_test is False:
-    raise ValueError("AutoTokenizer returned False, meaning it failed to load properly.")

-    MODEL_ID,
-    use_fast=False,
-    trust_remote_code=True
-)

-if getattr(tokenizer, "pad_token_id", None) is None:
-    tokenizer.pad_token_id = getattr(tokenizer, "eos_token_id", None)

    for msg in messages:

    gen_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,

@@ -92,52 +252,263 @@ def predict(messages, temperature, top_p, max_new_tokens):

        repetition_penalty=1.1,
        max_new_tokens=max_new_tokens,
    )
The updated app.py (514 lines):

# app.py

import os
import gc
import logging
import traceback
import time
import transformers
import torch
import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    BitsAndBytesConfig
)

###############################################################################
# Configure Logging
###############################################################################
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("DamageScan-App")

###############################################################################
# Model Configuration
###############################################################################
MODEL_ID = "FrameRateTech/DamageScan-llama-8b-instruct-merged"
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question is not clear or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

###############################################################################
# Device Configuration and Memory Management
###############################################################################
def get_device_info():
    """Log information about available devices and memory"""
    device_info = {
        "cuda_available": torch.cuda.is_available(),
        "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
        "mps_available": hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    }

    if device_info["cuda_available"] and device_info["device_count"] > 0:
        device_info["cuda_device_name"] = torch.cuda.get_device_name(0)
        device_info["cuda_device_mem_total"] = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        device_info["cuda_device_mem_reserved"] = torch.cuda.memory_reserved(0) / (1024**3)
        device_info["cuda_device_mem_allocated"] = torch.cuda.memory_allocated(0) / (1024**3)

    logger.info(f"Device information: {device_info}")
    return device_info

def optimize_memory():
    """Optimize memory usage by clearing caches and forcing garbage collection"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    logger.info("Memory optimized: caches cleared and garbage collected")

###############################################################################
# Model Loading with Error Handling
###############################################################################
def load_model_and_tokenizer():
    """Load the model and tokenizer with comprehensive error handling and logging"""
    logger.info(f"Loading model: {MODEL_ID}")
    logger.info(f"Transformers version: {transformers.__version__}")
    logger.info(f"PyTorch version: {torch.__version__}")

    device_info = get_device_info()

    # Determine quantization settings based on available hardware
    load_in_4bit = False
    load_in_8bit = False

    if device_info["cuda_available"]:
        # On ZEROGPU environments, 4-bit quantization helps fit the model in memory
        load_in_4bit = True
        logger.info("Using 4-bit quantization for CUDA device")

    # Configure quantization if needed
    if load_in_4bit:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True
        )
        logger.info("Configured 4-bit quantization with NF4 type")
    elif load_in_8bit:
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True
        )
        logger.info("Configured 8-bit quantization")
    else:
        quantization_config = None
        logger.info("No quantization configured, using default precision")

    # Step 1: Load tokenizer with detailed error logging
    try:
        logger.info("Loading tokenizer...")
        tokenizer_start = time.time()
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            use_fast=False,
            trust_remote_code=True
        )
        tokenizer_load_time = time.time() - tokenizer_start
        logger.info(f"Tokenizer loaded successfully in {tokenizer_load_time:.2f} seconds")
        logger.info(f"Tokenizer type: {type(tokenizer).__name__}")

        # Log important tokenizer properties
        tokenizer_info = {
            "vocab_size": len(tokenizer),
            "model_max_length": tokenizer.model_max_length,
            "bos_token": tokenizer.bos_token,
            "eos_token": tokenizer.eos_token,
            "has_chat_template": hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None
        }
        logger.info(f"Tokenizer properties: {tokenizer_info}")

        # Set pad token if needed
        if getattr(tokenizer, "pad_token_id", None) is None:
            logger.info("Pad token not found, setting pad_token_id to eos_token_id")
            tokenizer.pad_token_id = getattr(tokenizer, "eos_token_id", None)
    except Exception as e:
        logger.error(f"Failed to load tokenizer: {str(e)}")
        logger.error(traceback.format_exc())
        raise RuntimeError(f"Failed to load tokenizer: {str(e)}")

    # Step 2: Load model with detailed error logging
    try:
        logger.info("Loading model...")
        model_start = time.time()

        # Determine device map strategy
        if device_info["cuda_available"]:
            device_map = "auto"
            torch_dtype = torch.float16
            logger.info("Using 'auto' device map for CUDA with float16 precision")
        elif device_info["mps_available"]:
            device_map = {"": "mps"}
            torch_dtype = torch.float16
            logger.info("Using MPS device with float16 precision")
        else:
            device_map = {"": "cpu"}
            torch_dtype = torch.float32
            logger.info("Using CPU with float32 precision")

        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch_dtype,
            device_map=device_map,
            trust_remote_code=True,
            quantization_config=quantization_config
        )
        model.eval()
        model_load_time = time.time() - model_start
        logger.info(f"Model loaded successfully in {model_load_time:.2f} seconds")

        # Log model info
        model_info = {
            "model_type": model.config.model_type,
            "hidden_size": model.config.hidden_size,
            "vocab_size": model.config.vocab_size,
            "num_hidden_layers": model.config.num_hidden_layers
        }
        logger.info(f"Model properties: {model_info}")

    except Exception as e:
        logger.error(f"Failed to load model: {str(e)}")
        logger.error(traceback.format_exc())
        raise RuntimeError(f"Failed to load model: {str(e)}")

    return model, tokenizer

###############################################################################
# Chat Formatting and Generation Functions
###############################################################################
def format_chat_for_model(messages, tokenizer, system_prompt=DEFAULT_SYSTEM_PROMPT):
    """
    Format chat messages for the model using the tokenizer's chat template if available,
    or fall back to a manual format for Llama models.
    """
    logger.info(f"Formatting chat with {len(messages)} messages")

    # Prepare messages in the correct format
    formatted_messages = []

    # Add system message if not already present
    if messages and messages[0].get("role") != "system":
        formatted_messages.append({"role": "system", "content": system_prompt})

    # Add user and assistant messages
    for msg in messages:
        role = msg["role"]
        # Skip system messages if we already added one
        if role == "system" and formatted_messages and formatted_messages[0]["role"] == "system":
            continue
        formatted_messages.append({"role": role, "content": msg["content"]})

    # Use the tokenizer's built-in chat template if available
    if hasattr(tokenizer, "apply_chat_template") and callable(tokenizer.apply_chat_template):
        logger.info("Using tokenizer's built-in chat template")
        try:
            chat_text = tokenizer.apply_chat_template(
                formatted_messages,
                tokenize=False,
                add_generation_prompt=True
            )
            logger.debug(f"Formatted chat using built-in template: {chat_text[:100]}...")
            return chat_text
        except Exception as e:
            logger.warning(f"Failed to apply chat template: {str(e)}")
            logger.warning("Falling back to manual formatting")

    # Manual fallback format for Llama models
    logger.info("Using manual chat formatting for Llama model")
    chat_text = ""
    for msg in formatted_messages:
        role = msg["role"]
        content = msg["content"]
        if role == "system":
            chat_text += f"<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>"
        elif role == "user":
            chat_text += f"<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>"
        elif role == "assistant":
            chat_text += f"<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>"

    # Add the final assistant header for generation
    chat_text += "<|start_header_id|>assistant<|end_header_id|>\n\n"

    logger.debug(f"Manually formatted chat: {chat_text[:100]}...")
    return chat_text

def generate_response(model, tokenizer, messages, temperature=0.7, top_p=0.9, max_new_tokens=256, system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Generate a response from the model with retry logic and error handling"""
    logger.info(f"Generating response with temp={temperature}, top_p={top_p}, max_tokens={max_new_tokens}")

    # Format the messages for the model
    prompt = format_chat_for_model(messages, tokenizer, system_prompt)

    # Configure generation parameters
    gen_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=1.1,
        max_new_tokens=max_new_tokens,
    )

    # Tokenize the input
    try:
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        logger.info(f"Input tokenized to {inputs['input_ids'].shape[1]} tokens")
    except Exception as e:
        logger.error(f"Error during tokenization: {str(e)}")
        return "I encountered an error while processing your message. Please try again."

    # Generate with retry logic
    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            # Run the generation
            generation_start = time.time()
            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    generation_config=gen_config,
                )
            generation_time = time.time() - generation_start
            logger.info(f"Generation completed in {generation_time:.2f} seconds")

            # Decode the output
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # Extract just the assistant's response
            assistant_response = ""
            if hasattr(tokenizer, "apply_chat_template") and callable(tokenizer.apply_chat_template):
                # Extract assistant's response from the full output
                if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
                    parts = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")
                    if len(parts) > 1:
                        assistant_part = parts[-1]
                        if "<|eot_id|>" in assistant_part:
                            assistant_response = assistant_part.split("<|eot_id|>")[0].strip()
                        else:
                            assistant_response = assistant_part.strip()
                else:
                    # Fall back to removing the prompt
                    assistant_response = generated_text[len(prompt):].strip()
            else:
                # Simple extraction method
                assistant_response = generated_text[len(prompt):].strip()

            logger.info(f"Response extracted, length: {len(assistant_response)} chars")

            # Free up memory
            del inputs, output_ids
            optimize_memory()

            return assistant_response

        except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
            retry_count += 1
            logger.warning(f"Generation attempt {retry_count} failed: {str(e)}")

            if retry_count < max_retries:
                logger.info(f"Retrying with reduced parameters...")
                # Reduce parameters to try to fit in memory
                max_new_tokens = max(64, max_new_tokens // 2)
                optimize_memory()
            else:
                logger.error(f"Failed to generate after {max_retries} attempts")
                return "I'm sorry, I encountered a resource limitation while generating a response. Please try a shorter message or adjust the generation parameters."

        except Exception as e:
            logger.error(f"Unexpected error during generation: {str(e)}")
            logger.error(traceback.format_exc())
            return "I encountered an unexpected error. Please try again with different parameters."

###############################################################################
# Gradio Interface
###############################################################################
def build_gradio_interface(model, tokenizer):
    """Build and launch the Gradio interface"""
    logger.info("Building Gradio interface")

    def user_submit(message_history, user_text, temp, top_p, max_tokens, system_message):
        """Handle user message submission"""
        logger.info(f"Received user message: '{user_text[:50]}...' (length: {len(user_text)})")

        if not user_text.strip():
            logger.warning("Empty user message, skipping processing")
            return message_history, ""

        try:
            # Add user message to history
            if not message_history:
                # Start with system message if this is the first message
                message_history = [{"role": "system", "content": system_message}]

            message_history.append({"role": "user", "content": user_text})

            # Generate response
            assistant_response = generate_response(
                model,
                tokenizer,
                message_history,
                temperature=temp,
                top_p=top_p,
                max_new_tokens=max_tokens,
                system_prompt=system_message
            )

            # Add assistant response to history
            message_history.append({"role": "assistant", "content": assistant_response})
            logger.info(f"Added assistant response (length: {len(assistant_response)})")

            # Optimize memory after generation
            optimize_memory()

            return message_history, ""

        except Exception as e:
            logger.error(f"Error in user_submit: {str(e)}")
            logger.error(traceback.format_exc())

            # Return original message history plus error message
            error_msg = "I encountered an error processing your request. Please try again."
            if not message_history:
                message_history = []
            message_history.append({"role": "user", "content": user_text})
            message_history.append({"role": "assistant", "content": error_msg})

            return message_history, ""

    def clear_chat():
        """Clear the chat history"""
        logger.info("Clearing chat history")
        optimize_memory()
        return [], ""

    # Define the Gradio interface
    with gr.Blocks(css="footer {visibility: hidden}") as demo:
        gr.Markdown("<h1 align='center'>DamageScan 8B Instruct Chatbot</h1>")
        gr.Markdown("<p align='center'>Powered by FrameRateTech/DamageScan-llama-8b-instruct-merged</p>")

        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(
                    label="Chat History",
                    height=600,
                    avatar_images=(None, "https://huggingface.co/spaces/FrameRateTech/DamageScan-8b-instruct-chat/resolve/main/avatar.png"),
                )

                with gr.Row():
                    with gr.Column(scale=8):
                        user_input = gr.Textbox(
                            lines=3,
                            label="Your Message",
                            placeholder="Type your message here...",
                            show_copy_button=True
                        )
                    with gr.Column(scale=1, min_width=50):
                        submit_btn = gr.Button("Send", variant="primary")
                        clear_btn = gr.Button("Clear Chat")

            with gr.Column(scale=1):
                gr.Markdown("### System Prompt")
                system_prompt_input = gr.Textbox(
                    lines=5,
                    label="System Instructions",
                    value=DEFAULT_SYSTEM_PROMPT,
                    show_copy_button=True
                )

                gr.Markdown("### Generation Settings")
                temperature_slider = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature",
                    info="Higher values make output more random, lower values more deterministic"
                )
                top_p_slider = gr.Slider(
                    minimum=0.5, maximum=1.0, value=0.9, step=0.05, label="Top-p",
                    info="Controls diversity via nucleus sampling"
                )
                max_tokens_slider = gr.Slider(
                    minimum=64, maximum=1024, value=256, step=64, label="Max New Tokens",
                    info="Maximum length of generated response"
                )

                gr.Markdown("### Tips")
                gr.Markdown("""
                * Lower temperature (0.1-0.3) for factual responses
                * Higher temperature (0.7-1.0) for creative tasks
                * Reduce max tokens if responses are too long
                * Clear chat if the model gets confused
                """)

        # Set up event handlers
        submit_btn.click(
            user_submit,
            inputs=[chatbot, user_input, temperature_slider, top_p_slider, max_tokens_slider, system_prompt_input],
            outputs=[chatbot, user_input],
        )
        user_input.submit(
            user_submit,
            inputs=[chatbot, user_input, temperature_slider, top_p_slider, max_tokens_slider, system_prompt_input],
            outputs=[chatbot, user_input],
        )
        clear_btn.click(
            clear_chat,
            outputs=[chatbot, user_input]
        )

        # Add example prompts
        gr.Examples(
            examples=[
                ["Can you explain how the Large Hadron Collider works?"],
                ["Write a short story about a robot who learns to paint"],
                ["What are three ways to improve productivity when working from home?"],
                ["Explain quantum computing to me like I'm 10 years old"],
            ],
            inputs=user_input,
            label="Example Prompts"
        )

    return demo

###############################################################################
# Main Application Logic
###############################################################################
def main():
    """Main application entry point"""
    try:
        logger.info("Starting DamageScan 8B Instruct application")
        logger.info(f"Environment: CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")

        # Load model and tokenizer
        model, tokenizer = load_model_and_tokenizer()

        # Build and launch Gradio interface
        demo = build_gradio_interface(model, tokenizer)

        # Launch the app
        logger.info("Launching Gradio interface")
        demo.queue().launch(
            share=False,
            debug=False,
            show_error=True,
            favicon_path="https://huggingface.co/spaces/FrameRateTech/DamageScan-8b-instruct-chat/resolve/main/favicon.ico"
        )

    except Exception as e:
        logger.error(f"Application startup failed: {str(e)}")
        logger.error(traceback.format_exc())

        # Create a minimal fallback UI to show the error
        with gr.Blocks() as fallback_demo:
            gr.Markdown("# ⚠️ DamageScan 8B Application Error")
            gr.Markdown(f"The application encountered an error during startup:\n\n```\n{str(e)}\n```")
            gr.Markdown("Please check the logs for more details or try again later.")

        fallback_demo.launch()

if __name__ == "__main__":
    main()
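For reference, the loading and generation path added in this commit can also be exercised without the Gradio UI. The sketch below is a minimal, hypothetical smoke test (the file name sanity_check.py and the prompt are illustrative; it assumes app.py is importable from the working directory, the required packages — transformers, torch, gradio, and bitsandbytes when 4-bit quantization is selected — are installed, and there is enough memory to load the 8B model):

# sanity_check.py — hypothetical smoke test for the functions defined in app.py
from app import load_model_and_tokenizer, generate_response

# Load the model and tokenizer exactly as the Space does at startup
model, tokenizer = load_model_and_tokenizer()

# A single-turn conversation; generate_response prepends the default system prompt
messages = [
    {"role": "user", "content": "Give me one tip for writing clear bug reports."}
]
reply = generate_response(
    model,
    tokenizer,
    messages,
    temperature=0.3,
    top_p=0.9,
    max_new_tokens=128,
)
print(reply)

Because main() is guarded by if __name__ == "__main__", importing app only configures logging and defines the functions; it does not launch the Gradio interface.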