File size: 8,328 Bytes
76398c6
5c9514a
76398c6
 
 
 
a0b9dac
76398c6
992ded3
76398c6
5c9514a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
992ded3
76398c6
 
 
 
 
 
 
 
 
 
 
 
 
 
fddae32
 
 
 
76398c6
d37299b
ae0011e
 
d37299b
 
 
 
 
 
 
 
 
 
 
 
 
 
ae0011e
d37299b
 
 
 
 
 
 
 
ae0011e
 
76398c6
 
 
 
 
0cc3d3a
 
76398c6
 
e8be103
 
 
 
 
 
 
 
 
 
 
 
76398c6
5c9514a
 
 
76398c6
 
 
 
 
 
a0b9dac
 
 
 
fddae32
 
 
 
 
 
 
a0b9dac
76398c6
 
d37299b
79399de
161a324
79399de
 
76398c6
 
 
161a324
 
e8be103
 
 
161a324
76398c6
 
 
 
 
2d9aa2d
76398c6
a0b9dac
74c26d6
76398c6
 
 
 
 
 
 
 
 
 
d37299b
 
 
 
76398c6
0cc3d3a
ae0011e
 
 
 
 
d37299b
 
 
ae0011e
d37299b
76398c6
 
d37299b
 
 
 
76398c6
 
 
d37299b
76398c6
 
 
ae0011e
d37299b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import streamlit as st
from streamlit_datalist import stDatalist
import pandas as pd
from utils import extract_from_url, get_model, calculate_memory
import plotly.express as px
import numpy as np
import gc

# Configure the page: browser title, wide layout, and sidebar expanded on load.
st.set_page_config(page_title='Can you run it? LLM version', layout="wide", initial_sidebar_state="expanded")

# Suggested Hugging Face model ids offered by the model-name datalist.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which would merge two ids into one.
model_list = [
    "mistralai/Mistral-7B-v0.1",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "ehartford/samantha-mistral-7b",
    "SkunkworksAI/Mistralic-7B-1",
    "microsoft/phi-1_5",
    "PY007/TinyLlama-1.1B-intermediate-step-480k-1T",
    "codellama/CodeLlama-7b-hf",
    "codellama/CodeLlama-13b-hf",
    "codellama/CodeLlama-34b-hf",
    "Phind/Phind-CodeLlama-34B-v2",
    "WizardLM/WizardCoder-Python-34B-V1.0",
    "TheBloke/Llama-2-7B-fp16",
    "TheBloke/Llama-2-13B-fp16",
    "TheBloke/Llama-2-70B-fp16",
    "Gryphe/MythoMax-L2-13b",
    "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b",
    "lmsys/vicuna-7b-v1.5",
    "lmsys/vicuna-13b-v1.5-16k",
    "lmsys/longchat-7b-v1.5-32k",
    "tiiuae/falcon-7B-Instruct",
    "tiiuae/falcon-7B",
    "tiiuae/falcon-40B",
    "tiiuae/falcon-40B-Instruct",
    "tiiuae/falcon-180B",
    "tiiuae/falcon-180B-Chat",
]
st.title("Can you run it? LLM version")

# Cap the main block container's max-width at this percentage by injecting
# a small <style> element into the page.
percentage_width_main = 80
_container_css = f"""<style>
        .appview-container .main .block-container{{
        max-width: {percentage_width_main}%;}}
        </style>
        """
st.markdown(_container_css, unsafe_allow_html=True)
@st.cache_resource
def get_gpu_specs():
    """Load the GPU specification table from ``data/gpu_specs.csv``.

    Wrapped in ``st.cache_resource`` so the file is not re-read each time
    the function is called.

    Returns:
        pandas.DataFrame: the parsed CSV contents.
    """
    specs_csv = "data/gpu_specs.csv"
    return pd.read_csv(specs_csv)

@st.cache_resource
def get_mistralai_table():
    """Build the memory table for ``mistralai/Mistral-7B-v0.1``.

    The model is fetched with an empty access token and its memory is
    estimated for float32, float16/bfloat16, int8 and int4. The function
    is wrapped in ``st.cache_resource`` so the result is reused across calls.

    Returns:
        Whatever ``calculate_memory`` returns for this model.
    """
    dtypes = ["float32", "float16/bfloat16", "int8", "int4"]
    mistral = get_model("mistralai/Mistral-7B-v0.1", library="transformers", access_token="")
    return calculate_memory(mistral, dtypes)

def show_gpu_info(info, trainable_params=0, vendor=""):
    """Render one status box per workload describing the GPU requirement.

    Args:
        info: table indexed by workload name ('Inference',
            'Full Training Adam', 'LoRa Fine-tuning') with a
            'Number of GPUs' column.
        trainable_params: value appended as a percentage to the LoRa
            message (non-Apple vendors only).
        vendor: GPU vendor; "Apple" switches to a can/cannot-run message
            instead of a GPU count.
    """
    is_apple = vendor == "Apple"
    for workload in ('Inference', 'Full Training Adam', 'LoRa Fine-tuning'):
        gpu_count = info.loc[workload]['Number of GPUs']

        if is_apple:
            # Apple: only an exact count of one is reported as runnable.
            if gpu_count == 1:
                notify, symbol = st.success, "✅"
                text = f"You can run **{workload}**"
            else:
                notify, symbol = st.error, "⛔"
                text = f"You cannot run **{workload}**"
        else:
            # Severity scales with the number of GPUs needed.
            if gpu_count >= 3:
                notify, symbol = st.error, "⛔"
            elif gpu_count == 2:
                notify, symbol = st.warning, "⚠️"
            else:
                notify, symbol = st.success, "✅"

            text = f"You require **{gpu_count}** GPUs for **{workload}**"
            if workload == 'LoRa Fine-tuning':
                text += f" ({trainable_params}%)"

        notify(text, icon=symbol)


def get_name(index):
    """Return a display label for the GPU at positional row ``index``.

    The row is read from the module-level ``gpu_specs`` table and formatted
    as ``"<Product Name> (<RAM (GB)> GB, <Year>)"``.
    """
    spec = gpu_specs.iloc[index]
    product, ram_gb, year = spec['Product Name'], spec['RAM (GB)'], spec['Year']
    return f"{product} ({ram_gb} GB, {year})"

def custom_ceil(a, precision=0):
    """Round ``a`` up to ``precision`` decimal places.

    Unlike adding half a step and rounding, this is a true ceiling: a value
    that already sits on the requested grid is returned unchanged
    (``custom_ceil(7.3, 1) == 7.3``), and anything above it moves to the
    next step (``custom_ceil(7.24, 1) == 7.3``).

    Args:
        a: number (or NumPy array) to round up.
        precision: number of decimal places to keep; defaults to 0.

    Returns:
        The ceiling of ``a`` at the requested precision, as a NumPy float
        (or array).
    """
    scale = 10.0 ** precision
    # Rounding the scaled value to 9 decimals first absorbs binary
    # floating-point noise (e.g. 1.1 * 10 == 11.000000000000002) so that a
    # value already on the grid is not pushed up a whole step.
    return np.ceil(np.round(a * scale, 9)) / scale
# GPU specification table shared by the rest of the script.
gpu_specs = get_gpu_specs()

# Centre column hosts the "Information" panel; `col` is reused further down.
_, col, _ = st.columns([1,3,1])
with col.expander("Information", expanded=True):
    st.markdown("""- GPU information comes from [TechPowerUp GPU Specs](https://www.techpowerup.com/gpu-specs/)
- Mainly based on [Model Memory Calculator by hf-accelerate](https://huggingface.co/spaces/hf-accelerate/model-memory-usage)
    using `transformers` library
- Inference is calculated following [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/),
    where it is estimated as """)
    
    st.latex(r"""\text{Memory}_\text{Inference} \approx \text{Model Size} \times 1.2""")
    st.markdown("""- For LoRa Fine-tuning, I'm assuming a **16-bit** dtype of trainable parameters. The formula (in terms of GB) is""")
    # The 1.2 factor multiplies the whole sum, matching how the
    # 'LoRA Fine-Tuning (GB)' column is computed later in this script.
    st.latex(r"\text{Memory}_\text{LoRa} \approx \left(\text{Model Size} + \left(\text{ \# trainable Params}_\text{Billions}\times\frac{16}{8} \times 4\right)\right) \times 1.2")

# Sidebar inputs: optional access token and the model to analyse.
access_token = st.sidebar.text_input("Access token")
with st.sidebar.container():
    model_name = stDatalist("Model name (Press Enter to apply)", model_list, index=0)

# Nothing to compute until a model name has been provided.
if not model_name:
    st.info("Please enter a model name")
    st.stop()

model_name = extract_from_url(model_name)

# Session state holds the memory table of a single model at a time, stored
# under the model's own name; 'actual_model' remembers which key that is.
session = st.session_state
if model_name not in session:
    if 'actual_model' in session:
        # Evict the previously selected model's table before loading a new one.
        stale_key = session['actual_model']
        del session[stale_key]
        del session['actual_model']
        gc.collect()

    if model_name != "mistralai/Mistral-7B-v0.1":
        model = get_model(model_name, library="transformers", access_token=access_token)
        session[model_name] = calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
        # Drop the model object as soon as its table has been computed.
        del model
        gc.collect()
    else:
        # The default model goes through the cached helper.
        session[model_name] = get_mistralai_table()

    session['actual_model'] = model_name


# --- GPU selection (sidebar) -------------------------------------------------
gpu_vendor = st.sidebar.selectbox("GPU Vendor", ["NVIDIA", "AMD", "Intel", "Apple"])
gpu_info = gpu_specs[gpu_specs['Vendor'] == gpu_vendor].sort_values('Product Name')
# NOTE: a release-year filter was prototyped here but is currently disabled.

# Without any rows for the vendor the RAM bounds below would be NaN.
if gpu_info.empty:
    st.sidebar.error(f"No GPU data available for **{gpu_vendor}**")
    st.stop()

# Plain floats keep every numeric slider argument the same type.
min_ram = float(gpu_info['RAM (GB)'].min())
max_ram = float(gpu_info['RAM (GB)'].max())

# Preferred default window is 10-40 GB, but the slider rejects a default that
# lies outside its bounds, so clamp it to what this vendor actually offers.
# If the window does not intersect the vendor's range, show the full range.
default_low = max(10.0, min_ram)
default_high = min(40.0, max_ram)
if default_low > default_high:
    default_low, default_high = min_ram, max_ram

ram = st.sidebar.slider("Filter by RAM (GB)", min_ram, max_ram, (default_low, default_high), step=0.5)
gpu_info = gpu_info[gpu_info["RAM (GB)"].between(ram[0], ram[1])]
if gpu_info.empty:
    st.sidebar.error(f"**{gpu_vendor}** has no GPU in that RAM range")
    st.stop()

# Options are the index values of the filtered rows; they are used as
# positions into the full `gpu_specs` table here and in `get_name`.
gpu = st.sidebar.selectbox("GPU", gpu_info['Product Name'].index.tolist(), format_func=lambda x : gpu_specs.iloc[x]['Product Name'])
gpu_spec = gpu_specs.iloc[gpu]
gpu_spec.name = 'INFO'  # column header shown in the sidebar spec table

lora_pct = st.sidebar.slider("LoRa % trainable parameters", 0.1, 100.0, 2.0, step=0.1)

st.sidebar.dataframe(gpu_spec.T.astype(str))

# Per-dtype memory estimates for the selected model, one row per dtype.
memory_table = pd.DataFrame(st.session_state[model_name]).set_index('dtype')

# LoRA estimate: (model size + trainable-parameter overhead) * 1.2, where the
# overhead is the trainable share of the parameters times (16/8) * 4.
lora_overhead = memory_table["Parameters (Billion)"] * lora_pct / 100 * (16 / 8) * 4
memory_table['LoRA Fine-Tuning (GB)'] = (memory_table["Total Size (GB)"] + lora_overhead) * 1.2

# GPU counts come from the unrounded figures: divide every column by the
# selected GPU's RAM, round up, then drop the columns that are not workloads.
_memory_table = memory_table / gpu_spec['RAM (GB)']
memory_table = memory_table.round(2).T

_memory_table = _memory_table.apply(np.ceil).astype(int)
_memory_table = _memory_table.drop(columns=['Parameters (Billion)', 'Total Size (GB)'])
_memory_table.columns = ['Inference', 'Full Training Adam', 'LoRa Fine-tuning']

# Long format: one (dtype, workload, GPU count) row per combination.
_memory_table = _memory_table.stack().reset_index()
_memory_table.columns = ['dtype', 'Variable', 'Number of GPUs']

col1, col2 = st.columns([1,1.3])

# Apple caveat, shown in the centre column that also hosts the info panel.
if gpu_vendor == "Apple":
    col.warning("""For M1/M2 Apple chips, PyTorch uses [Metal Performance Shaders (MPS)](https://huggingface.co/docs/accelerate/usage_guides/mps) as backend.\\
Remember that Apple M1/M2 chips share memory between CPU and GPU.""", icon="⚠️")

with col1:
    # Row 3 / column 0 of the transposed table is shown with a "B" suffix,
    # presumably the parameter count in billions -- TODO confirm against the
    # column order produced by calculate_memory.
    size_label = custom_ceil(memory_table.iloc[3, 0], 1)
    st.write(f"####  [{model_name}](https://huggingface.co/{model_name}) ({size_label:.1f}B)")

    # One tab per dtype, in reverse of the table's column order.
    dtypes = list(reversed(memory_table.columns.tolist()))
    for dtype, tab in zip(dtypes, st.tabs(dtypes)):
        with tab:
            if dtype in ("int4", "int8"):
                _dtype = dtype[len("int"):]
                st.markdown(f"`int{_dtype}` refers to models in `GPTQ-{_dtype}bit`, `AWQ-{_dtype}bit` or `Q{_dtype}_0 GGUF/GGML`")
            per_dtype = _memory_table.loc[_memory_table['dtype'] == dtype]
            show_gpu_info(per_dtype.set_index('Variable'), lora_pct, gpu_vendor)

    # Show the rounded table without row 3.
    st.write(memory_table.iloc[[0, 1, 2, 4]])
with col2:
    on_apple = gpu_vendor == "Apple"
    if on_apple:
        st.warning("This graph is irrelevant for M1/M2 chips as they can't run in parallel.", icon="⚠️")
    # The chart title is prefixed with a warning sign for Apple.
    extra = "⚠️" if on_apple else ""

    # Pick four evenly spaced colours across the RdBu palette, one per dtype.
    num_colors = 4
    palette = px.colors.sequential.RdBu
    last = len(palette) - 1
    colors = [palette[int(i * last / (num_colors - 1))] for i in range(num_colors)]

    chart_title = f"{extra} Number of GPUs required for<br> {get_name(gpu)}"
    fig = px.bar(
        _memory_table,
        x='Variable',
        y='Number of GPUs',
        color='dtype',
        barmode='group',
        color_discrete_sequence=colors,
    )
    fig.update_layout(
        title=dict(text=chart_title, font=dict(size=25)),
        xaxis_tickfont_size=14,
        yaxis_tickfont_size=16,
        yaxis_dtick='1',
    )
    st.plotly_chart(fig, use_container_width=True)