liuhaotian committed • Commit 7564980 • Parent(s): c5721f2

Add flash attention

Files changed:
- app.py +7 -6
- requirements.txt +2 -2
app.py
CHANGED
@@ -40,6 +40,7 @@ def start_worker(model_path: str, bits=16):
         model_path,
         "--model-name",
         model_name,
+        "--use-flash-attn",
     ]
     if bits != 16:
         worker_command += [f"--load-{bits}bit"]
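For context, here is a minimal sketch of how `start_worker` might assemble and launch the worker around the lines above. The `llava.serve.model_worker` entry point, the `model_name` derivation, and the use of `subprocess` are assumptions based on the visible fragment, not the Space's exact code:

# Sketch only: assumed shape of start_worker around the diffed lines.
import subprocess

def start_worker(model_path: str, bits=16):
    # Assumed convention: derive the worker's display name from the
    # checkpoint path, e.g. "liuhaotian/llava-v1.6-34b" -> "llava-v1.6-34b".
    model_name = model_path.strip("/").split("/")[-1]
    worker_command = [
        "python", "-m", "llava.serve.model_worker",  # assumed entry point
        "--model-path", model_path,
        "--model-name", model_name,
        "--use-flash-attn",  # the flag this commit adds
    ]
    if bits != 16:
        # Quantized loading, e.g. --load-4bit or --load-8bit
        worker_command += [f"--load-{bits}bit"]
    return subprocess.Popen(worker_command)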
@@ -65,12 +66,12 @@ if __name__ == "__main__":
     ONLY WORKS WITH GPU! By default, we load the model with 4-bit quantization to make it fit on smaller hardware. Set the environment variable `bits` to control the quantization.
 
     Set the environment variable `model` to change the model, and switch hardware accordingly:
-    | Model
-    |
-    | liuhaotian/llava-v1.6-mistral-7b | T4
-    | liuhaotian/llava-v1.6-vicuna-7b | T4
-    | liuhaotian/llava-v1.6-vicuna-13b | T4
-    | liuhaotian/llava-v1.6-34b
+    | Model                            | Hardware   |
+    |----------------------------------|------------|
+    | liuhaotian/llava-v1.6-mistral-7b | T4 small   |
+    | liuhaotian/llava-v1.6-vicuna-7b  | T4 small   |
+    | liuhaotian/llava-v1.6-vicuna-13b | T4 small   |
+    | liuhaotian/llava-v1.6-34b        | A10G large |
     """
 
     print(f"args: {gws.args}")
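The docstring above documents two knobs, `model` and `bits`. A hedged sketch of how the Space might read them; the variable names mirror the docstring, and the defaults are assumptions (the docstring only states that 4-bit is the default):

import os

# `model` selects the checkpoint; `bits` controls quantization.
# Both fallback values here are assumptions, not the Space's exact code.
model_path = os.environ.get("model", "liuhaotian/llava-v1.6-mistral-7b")
bits = int(os.environ.get("bits", "4"))

start_worker(model_path, bits=bits)  # start_worker as sketched above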
requirements.txt
CHANGED
@@ -1,2 +1,2 @@
-llava-torch
-
+llava-torch
+flash-attn
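One reproduction note: flash-attn compiles against an already-installed torch, so when recreating this environment outside the Space's build pipeline, it typically needs to be installed after llava-torch, e.g. with `pip install flash-attn --no-build-isolation` as the flash-attn project recommends.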