Spaces:

n00b001
/

llm-compressor-my-repo

Sleeping

File size: 16,248 Bytes

#!/usr/bin/env python
"""
Specialized script for quantizing Qwen2.5-VL models with sequential onloading
Handles quantization of Qwen2_5_VLForConditionalGeneration models properly
"""

import base64
from io import BytesIO
from typing import Optional, Union, Dict, Any
import torch
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from llmcompressor.utils import dispatch_for_generation


def create_qwen2_5_vl_data_collator():
    """Create a data collator for Qwen2.5-VL models that handles multimodal inputs."""
    def data_collator(batch):
        assert len(batch) == 1
        return {key: torch.tensor(value) if isinstance(value, (list, int, float)) else value 
                for key, value in batch[0].items()}
    return data_collator


def create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length: int = 2048):
    """Create a preprocessing function for Qwen2.5-VL datasets."""
    def preprocess_and_tokenize(example):
        # Handle different image formats
        if 'image' in example:
            # Process image
            if hasattr(example['image'], 'save'):
                # PIL Image object
                buffered = BytesIO()
                example["image"].save(buffered, format="PNG")
                encoded_image = base64.b64encode(buffered.getvalue())
                encoded_image_text = encoded_image.decode("utf-8")
                base64_qwen = f"data:image;base64,{encoded_image_text}"
            else:
                # Already a string or other format
                base64_qwen = str(example["image"])
        else:
            # If there's no image field, try 'img' or similar
            img_key = None
            for key in example.keys():
                if 'image' in key.lower() or 'img' in key.lower():
                    img_key = key
                    break
            if img_key:
                if hasattr(example[img_key], 'save'):
                    buffered = BytesIO()
                    example[img_key].save(buffered, format="PNG")
                    encoded_image = base64.b64encode(buffered.getvalue())
                    encoded_image_text = encoded_image.decode("utf-8")
                    base64_qwen = f"data:image;base64,{encoded_image_text}"
                else:
                    base64_qwen = str(example[img_key])
            else:
                # If no image, create a simple text-only example
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": example.get('text', example.get('content', 'What can you tell me about this?'))},
                        ],
                    }
                ]
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                
                return processor(
                    text=[text],
                    padding=False,
                    max_length=max_sequence_length,
                    truncation=True,
                )

        # Create message with image
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        # tokenize
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
        )
    
    return preprocess_and_tokenize


def get_qwen2_5_vl_quantization_recipe(method: str, scheme: str = "W4A16"):
    """
    Creates the appropriate quantization recipe for Qwen2.5-VL models.
    
    Args:
        method: Quantization method ("GPTQ", "AWQ", or "FP8")
        scheme: Quantization scheme (e.g., "W4A16", "W8A8", "FP8")
    
    Returns:
        List of modifiers for the quantization recipe
    """
    if method == "GPTQ":
        return [
            GPTQModifier(
                targets="Linear",
                scheme=scheme,
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                sequential_targets=["Qwen2_5_VLDecoderLayer"],  # This is key for the architecture
            ),
        ]
    elif method == "AWQ":
        # Create AWQ mappings for Qwen2.5-VL architecture
        mappings = [
            AWQMapping(
                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
            ),
            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
            AWQMapping(
                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
            ),
            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
        ]
        return [
            AWQModifier(
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                scheme="W4A16_ASYM" if scheme == "W4A16" else scheme,
                targets=["Linear"],
                mappings=mappings,
            ),
        ]
    elif method == "FP8":
        return [
            QuantizationModifier(
                scheme="FP8", 
                targets="Linear", 
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"]
            )
        ]
    else:
        raise ValueError(f"Unsupported quantization method: {method}")


def quantize_qwen2_5_vl_model(
    model_id: str,
    quantization_method: str,
    output_dir: Optional[str] = None,
    dataset_id: str = "lmms-lab/flickr30k",
    dataset_split: str = "test[:512]",
    num_calibration_samples: int = 512,
    max_sequence_length: int = 2048,
    scheme: str = "W4A16",
    trust_remote_code: bool = True,
):
    """
    Quantizes a Qwen2.5-VL model with proper architecture handling and sequential onloading.
    
    Args:
        model_id: Hugging Face model ID to quantize
        quantization_method: Method to use ("GPTQ", "AWQ", or "FP8")
        output_dir: Directory to save the quantized model
        dataset_id: Dataset ID for calibration
        dataset_split: Dataset split for calibration
        num_calibration_samples: Number of samples to use for calibration
        max_sequence_length: Maximum sequence length for processing
        scheme: Quantization scheme (e.g., "W4A16", "W8A8")
        trust_remote_code: Whether to trust remote code in model loading
    
    Returns:
        Quantized model
    """
    print(f"Loading model: {model_id}")

    # Handle different device scenarios properly
    if torch.cuda.is_available():
        try:
            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_id,
                torch_dtype=torch.float16,  # Use float16 to save memory on GPU
                device_map="auto",  # Auto device mapping for memory efficiency
                trust_remote_code=trust_remote_code
            )
        except RuntimeError as e:
            if "out of memory" in str(e).lower() or "offload_dir" in str(e):
                print(f"Memory issue detected, using offloading: {e}")
                import tempfile
                with tempfile.TemporaryDirectory() as temp_dir:
                    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                        model_id,
                        torch_dtype=torch.float16,
                        device_map="auto",
                        offload_folder=temp_dir,
                        max_memory={0: "24GB", "cpu": "48GB"},
                        trust_remote_code=trust_remote_code
                    )
            else:
                raise
    else:
        # CPU only
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float32,  # Use float32 on CPU
            device_map="cpu",
            trust_remote_code=trust_remote_code
        )
    
    print(f"Loading processor for: {model_id}")
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    
    # If output directory not specified, create one based on model and method
    if not output_dir:
        model_name = model_id.rstrip("/").split("/")[-1]
        output_dir = f"{model_name}-{scheme.replace(':', '-')}-{quantization_method}"
    
    print(f"Output directory: {output_dir}")
    
    # Load dataset and preprocess
    print(f"Loading dataset: {dataset_id}")
    try:
        ds = load_dataset(dataset_id, split=dataset_split)
    except Exception as e:
        print(f"Failed to load {dataset_id}, trying alternative text-only dataset: {e}")
        # If the image dataset fails, try a text-only dataset
        ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:512]")
        # We'll need to adjust preprocessing for text-only data

    ds = ds.shuffle(seed=42)
    
    # Apply preprocessing
    preprocess_fn = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length)
    try:
        ds = ds.map(preprocess_fn, remove_columns=ds.column_names if hasattr(ds, 'column_names') else [])
    except Exception as e:
        print(f"Preprocessing failed: {e}")
        print("Trying simpler preprocessing with text-only data...")
        # Fallback: use text-only preprocessing
        def text_only_preprocess(example):
            text = example.get('text', example.get('content', str(example)))
            if not isinstance(text, str):
                text = str(text)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text[:500] + "..." if len(text) > 500 else text},  # Limit length
                    ],
                }
            ]
            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            return processor(text=[prompt], padding=False, max_length=max_sequence_length, truncation=True)
        
        ds = ds.map(text_only_preprocess, remove_columns=ds.column_names if hasattr(ds, 'column_names') else [])
    
    # Define data collator
    data_collator = create_qwen2_5_vl_data_collator()
    
    # Create recipe
    recipe = get_qwen2_5_vl_quantization_recipe(quantization_method, scheme)
    
    print(f"Starting quantization with method: {quantization_method}")
    print(f"Using recipe: {recipe}")
    
    # Perform oneshot quantization with sequential targets and proper handling
    oneshot(
        model=model,
        tokenizer=processor,  # Use processor as tokenizer for Qwen2.5-VL
        dataset=ds,
        recipe=recipe,
        max_seq_length=max_sequence_length,
        num_calibration_samples=num_calibration_samples,
        trust_remote_code_model=trust_remote_code,
        data_collator=data_collator,
        # Use sequential onloading for memory efficiency
        sequential_targets=["Qwen2_5_VLDecoderLayer"],
        save_compressed=True,
        output_dir=output_dir,
    )
    
    print(f"Quantization completed! Model saved to: {output_dir}")
    
    # Save the processor as well
    processor.save_pretrained(output_dir)
    
    return model


def test_quantized_model(model, processor, max_sequence_length: int = 2048):
    """
    Tests the quantized model with a sample generation.
    """
    print("========== SAMPLE GENERATION ==============")
    try:
        dispatch_for_generation(model)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
                    },
                    {"type": "text", "text": "Please describe the animal in this image\n"},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
            return_tensors="pt",
        ).to(model.device)
        
        output = model.generate(**inputs, max_new_tokens=100)
        result = processor.decode(output[0], skip_special_tokens=True)
        print(result)
        print("==========================================")
        return result
    except Exception as e:
        print(f"Test generation failed: {e}")
        print("Trying text-only generation...")
        # Try with text-only
        try:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Hello, how are you today?"},
                    ],
                }
            ]
            prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
            inputs = processor(
                text=[prompt],
                padding=False,
                max_length=max_sequence_length,
                truncation=True,
                return_tensors="pt",
            ).to(model.device)
            
            output = model.generate(**inputs, max_new_tokens=50)
            result = processor.decode(output[0], skip_special_tokens=True)
            print(result)
            print("==========================================")
            return result
        except Exception as e2:
            print(f"Text-only generation also failed: {e2}")
            return None


def main():
    """
    Main function to demonstrate quantization of Qwen2.5-VL models.
    """
    import argparse
    
    parser = argparse.ArgumentParser(description="Quantize Qwen2.5-VL models")
    parser.add_argument("--model_id", type=str, required=True, 
                        help="Model ID to quantize (e.g., 'huihui-ai/Huihui-Fara-7B-abliterated')")
    parser.add_argument("--method", type=str, choices=["GPTQ", "AWQ", "FP8"], 
                        default="GPTQ", help="Quantization method to use")
    parser.add_argument("--output_dir", type=str, default=None, 
                        help="Output directory for quantized model")
    parser.add_argument("--dataset_id", type=str, default="lmms-lab/flickr30k",
                        help="Dataset for calibration (default: lmms-lab/flickr30k)")
    parser.add_argument("--scheme", type=str, default="W4A16", 
                        help="Quantization scheme (e.g., W4A16, W8A8)")
    parser.add_argument("--num_samples", type=int, default=128, 
                        help="Number of calibration samples")
    
    args = parser.parse_args()
    
    print(f"Starting quantization of {args.model_id} using {args.method}")
    
    try:
        # Quantize the model
        quantized_model = quantize_qwen2_5_vl_model(
            model_id=args.model_id,
            quantization_method=args.method,
            output_dir=args.output_dir,
            dataset_id=args.dataset_id,
            num_calibration_samples=args.num_samples,
            scheme=args.scheme
        )
        
        # Test the model
        # Load the processor again to test
        processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
        test_quantized_model(quantized_model, processor)
        
        print(f"Successfully quantized {args.model_id} with {args.method}")
        
    except Exception as e:
        print(f"Quantization failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()