#!/usr/bin/env python3
"""
Use FastVLM-1.5B - The smaller variant that works with limited RAM
This model requires only ~3GB RAM and maintains good performance
"""

import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM

# Use the smaller FastVLM model
MID = "apple/FastVLM-1.5B"  # Smaller model - only 1.5B parameters
IMAGE_TOKEN_INDEX = -200

def load_fastvlm_small():
    """Load FastVLM-1.5B which works with limited RAM"""
    print("Loading FastVLM-1.5B (optimized for limited RAM)...")
    print("This model requires only ~3GB RAM\n")
    
    # Load tokenizer
    print("1. Loading tokenizer...")
    tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
    print(f"   ✓ Tokenizer loaded")
    
    # Determine device
    if torch.cuda.is_available():
        device = "cuda"
        dtype = torch.float16
    elif torch.backends.mps.is_available():
        device = "mps"  
        dtype = torch.float16
    else:
        device = "cpu"
        dtype = torch.float32
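    # float16 halves the weight footprint on CUDA/MPS; CPU inference stays in
    # float32, where half-precision kernels are poorly supported.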
    
    print(f"\n2. Loading model on {device}...")
    print("   This will download ~3GB on first run...")
    
    # Load model with memory optimization
    model = AutoModelForCausalLM.from_pretrained(
        MID,
        torch_dtype=dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )
    
    # Move to device
    model = model.to(device)
    model.eval()
    
    print(f"   ✓ FastVLM-1.5B loaded successfully!")
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"   ✓ Parameters: {total_params / 1e9:.2f}B")
    
    return model, tok, device
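
# The hand-rolled transform in test_generation() below is only an
# approximation. LLaVA-style remote code usually exposes the vision tower
# together with its bundled image processor, which knows the exact input
# resolution and normalization. `get_vision_tower()` mirrors the model-card
# example; treat it as an assumption if the remote code changes.
def preprocess_with_model_processor(model, image, device):
    """Preprocess a PIL image with the model's own image processor."""
    processor = model.get_vision_tower().image_processor
    pixel_values = processor(images=image, return_tensors="pt")["pixel_values"]
    return pixel_values.to(device, dtype=model.dtype)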

def test_generation(model, tok, device):
    """Test the model with a sample image"""
    print("\n3. Testing generation...")
    
    # Create test image
    test_image = Image.new('RGB', (336, 336), color='blue')
    
    # Prepare prompt
    messages = [
        {"role": "user", "content": "<image>\nDescribe this image."}
    ]
    
    # Apply chat template
    rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    pre, post = rendered.split("<image>", 1)
    
    # Tokenize
    pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
    post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(device)
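    # The final sequence is <pre tokens> + sentinel + <post tokens>;
    # add_special_tokens=False on both halves avoids duplicating BOS/EOS
    # around the splice point.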
    
    # Process image (simplified for testing)
    from torchvision import transforms
    transform = transforms.Compose([
        transforms.Resize((336, 336)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                             std=[0.26862954, 0.26130258, 0.27577711])
    ])
    pixel_values = transform(test_image).unsqueeze(0).to(device)
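    # NOTE: 336x336 with CLIP statistics is only an approximation for this
    # smoke test; for real inputs prefer the model's bundled processor
    # (see preprocess_with_model_processor above), which knows the exact
    # resolution and normalization FastVLM expects.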
    
    print("   Generating response...")
    
    # Generate; the FastVLM remote code follows the LLaVA convention of taking
    # the image tensor through the `images` keyword (as in the model card
    # example), and an explicit attention mask avoids padding ambiguity.
    with torch.no_grad():
        outputs = model.generate(
            inputs=input_ids,
            attention_mask=torch.ones_like(input_ids),
            images=pixel_values,
            max_new_tokens=50,
            temperature=0.7,
            do_sample=True
        )
    
    # Decode
    response = tok.decode(outputs[0], skip_special_tokens=True)
    print(f"   Response: {response[:100]}...")
    print("\n✅ FastVLM-1.5B is working correctly!")

if __name__ == "__main__":
    print("="*60)
    print("FastVLM-1.5B - Optimized for Limited RAM")
    print("="*60)
    print()
    
    try:
        model, tok, device = load_fastvlm_small()
        test_generation(model, tok, device)
        
        print("\n" + "="*60)
        print("SUCCESS: FastVLM-1.5B is ready for use!")
        print("="*60)
        print("\nThis smaller model:")
        print("• Uses only ~3GB RAM")
        print("• Maintains good performance")
        print("• Works on your system")
        print("• Has same API as FastVLM-7B")
        
    except Exception as e:
        print(f"\n✗ Error: {e}")
        print("\nEven FastVLM-1.5B failed to load.")
        print("Please close other applications and try again.")