llava

Paused

App Files Files Community

badayvedat

liuhaotian committed on Oct 10, 2023

Commit

c6dfdac

•

1 Parent(s): 255cd6e

Load 13B model with 8-bit/4-bit quantization to support more hardwares (#2)

Browse files

- Load 13B model with 8-bit/4-bit quantization to support more hardwares (2043a67569994113ef5f4a8d0c58df57f6c2ec66)
- Update requirements.txt (45e69a6796b68457d9e0f2e7bf82cc5f7a38b2b1)
- Update app.py (4e058355a3b5dcf3470e3a49b891eb91455f030b)
- Update app.py (4ad10fb0867be1212b7746919900c9fd16014f69)

Co-authored-by: Haotian Liu <liuhaotian@users.noreply.huggingface.co>

Files changed (2) hide show

app.py +20 -3
requirements.txt +2 -2

app.py CHANGED Viewed

@@ -325,6 +325,14 @@ title_markdown = """
 [[Project Page]](https://llava-vl.github.io) [[Paper]](https://arxiv.org/abs/2304.08485) [[Code]](https://github.com/haotian-liu/LLaVA) [[Model]](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
 ONLY WORKS WITH GPU!
 """
 tos_markdown = """
@@ -522,8 +530,12 @@ def start_controller():
     return subprocess.Popen(controller_command)
-def start_worker(model_path: str):
     logger.info(f"Starting the model worker for the model {model_path}")
     worker_command = [
         "python",
         "-m",
@@ -534,7 +546,11 @@ def start_worker(model_path: str):
         "http://localhost:10000",
         "--model-path",
         model_path,
     ]
     return subprocess.Popen(worker_command)
@@ -582,12 +598,13 @@ if __name__ == "__main__":
     args = get_args()
     logger.info(f"args: {args}")
-    model_path = "liuhaotian/llava-v1.5-7b"
     preload_models(model_path)
     controller_proc = start_controller()
-    worker_proc = start_worker(model_path)
     # Wait for worker and controller to start
     time.sleep(10)

 [[Project Page]](https://llava-vl.github.io) [[Paper]](https://arxiv.org/abs/2304.08485) [[Code]](https://github.com/haotian-liu/LLaVA) [[Model]](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
 ONLY WORKS WITH GPU!
+You can load the model with 8-bit or 4-bit quantization to make it fit in smaller hardwares. Setting the environment variable `bits` to control the quantization.
+Recommended configurations:
+| Hardware          | A10G-Large (24G) | T4-Medium (15G) | A100-Large (40G) |
+|-------------------|------------------|-----------------|------------------|
+| **Bits**          | 8 (default)      | 4               | 16               |
 """
 tos_markdown = """
     return subprocess.Popen(controller_command)
+def start_worker(model_path: str, bits=16):
     logger.info(f"Starting the model worker for the model {model_path}")
+    model_name = model_path.strip('/').split('/')[-1]
+    assert bits in [4, 8, 16], "It can be only loaded with 16-bit, 8-bit, and 4-bit."
+    if bits != 16:
+        model_name += f'-{bits}bit'
     worker_command = [
         "python",
         "-m",
         "http://localhost:10000",
         "--model-path",
         model_path,
+        "--model-name",
+        model_name,
     ]
+    if bits != 16:
+        worker_command += [f'--load-{bits}bit']
     return subprocess.Popen(worker_command)
     args = get_args()
     logger.info(f"args: {args}")
+    model_path = "liuhaotian/llava-v1.5-13b"
+    bits = int(os.getenv("bits", 8))
     preload_models(model_path)
     controller_proc = start_controller()
+    worker_proc = start_worker(model_path, bits=bits)
     # Wait for worker and controller to start
     time.sleep(10)

requirements.txt CHANGED Viewed

@@ -8,8 +8,8 @@ numpy
 requests
 sentencepiece
 tokenizers>=0.12.1
-torch
-torchvision
 uvicorn
 wandb
 shortuuid

 requests
 sentencepiece
 tokenizers>=0.12.1
+torch==2.0.1
+torchvision==0.15.2
 uvicorn
 wandb
 shortuuid