File size: 2,924 Bytes
ad6330a
 
 
ef08154
ad6330a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bf0b67
ad6330a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import subprocess
import os, requests
import torch, torchvision
import spaces
from huggingface_hub import login
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, LlavaNextForConditionalGeneration, LlavaForConditionalGeneration, PaliGemmaForConditionalGeneration, Idefics2ForConditionalGeneration

# Install required package
def install_flash_attn():
    """Install the ``flash-attn`` package with pip, skipping its CUDA build.

    Runs ``pip install flash-attn --no-build-isolation`` in a child process
    with ``FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE`` set. pip's exit status is
    not checked, so a failed install does not raise here.
    """
    # Extend the current environment instead of replacing it: passing only the
    # single variable as ``env`` would strip PATH, HOME, proxy settings, etc.
    # from the child process.
    build_env = {**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}
    subprocess.run(
        ["pip", "install", "flash-attn", "--no-build-isolation"],
        env=build_env,
        check=False,
    )

# Authenticate with Hugging Face
def authenticate_hf(token):
    """Log in to the Hugging Face Hub using ``token``.

    The token is forwarded to ``huggingface_hub.login`` together with
    ``add_to_git_credential=True``.
    """
    login_options = {"token": token, "add_to_git_credential": True}
    login(**login_options)

# Function to get the model summary
# Maps a model name to the ``str(model)`` summary produced for it earlier, so
# that a model which was already loaded once is not downloaded and built again.
model_cache = {}

@spaces.GPU
def get_model_summary(model_name):
    """Return the textual module summary of a Hugging Face model.

    The model's ``config.json`` is fetched from the Hub to find its first
    declared architecture, the matching ``transformers`` class is used to load
    the model, and ``str(model)`` is returned and cached in ``model_cache``.

    Args:
        model_name: Hub repository id, e.g. ``"org/model"``. A name containing
            ``"quantized"`` (case-insensitive) is loaded with a 4-bit
            ``BitsAndBytesConfig``.

    Returns:
        A ``(summary, error)`` tuple of strings. On success ``error`` is
        ``""``; on any failure ``summary`` is ``""`` and ``error`` holds the
        message.
    """
    if model_name in model_cache:
        return model_cache[model_name], ""

    try:
        # Fetch the config.json file
        config_url = f"https://huggingface.co/{model_name}/raw/main/config.json"
        # Only send credentials when a token is configured; otherwise the
        # header would carry the literal string "Bearer None".
        hf_token = os.getenv("HF_TOKEN")
        headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
        # A timeout keeps an unresponsive Hub from hanging the request forever.
        response = requests.get(config_url, headers=headers, timeout=30)
        response.raise_for_status()
        config = response.json()

        architectures = config.get("architectures")
        if not architectures:
            return "", f"config.json of '{model_name}' does not declare any architectures"
        architecture = architectures[0]

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Quantization is detected from the repository name only.
        is_quantized = "quantized" in model_name.lower()

        # NOTE(review): trust_remote_code=True executes code shipped in the
        # model repository; confirm that model_name only comes from trusted
        # input.
        load_kwargs = {"trust_remote_code": True}
        if is_quantized:
            # The bitsandbytes settings belong in ``quantization_config``;
            # ``config`` is reserved for the model configuration object.
            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)

        # Architectures with a dedicated class; anything else falls back to
        # AutoModelForCausalLM.
        model_classes = {
            "LlavaNextForConditionalGeneration": LlavaNextForConditionalGeneration,
            "LlavaForConditionalGeneration": LlavaForConditionalGeneration,
            "PaliGemmaForConditionalGeneration": PaliGemmaForConditionalGeneration,
            "Idefics2ForConditionalGeneration": Idefics2ForConditionalGeneration,
        }
        model_class = model_classes.get(architecture, AutoModelForCausalLM)
        model = model_class.from_pretrained(model_name, **load_kwargs)

        # Move to device only if the model is not quantized
        if not is_quantized:
            model = model.to(device)

        model_summary = str(model)
        model_cache[model_name] = model_summary
        return model_summary, ""
    except Exception as e:
        # Errors are handed back to the caller as text instead of raising.
        return "", str(e)