LLaVA-Ortho

Sleeping

App Files Files Community

Nick Vandal committed on Feb 25

Commit

6c420e0

•

1 Parent(s): 0c371b7

added multiple models and revisions

Browse files

Files changed (2) hide show

LLaVA +1 -1
app.py +20 -4

LLaVA CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~3e83206ab58f79936da2742c85c93dfd3890451c~~


1	+ Subproject commit 3c2f6ba15ed0477f4149fd582d2b640e19da2a57

app.py CHANGED Viewed

@@ -25,7 +25,7 @@ def start_controller():
     return subprocess.Popen(controller_command)
-def start_worker(model_path: str, bits=16):
     print(f"Starting the model worker for the model {model_path}")
     model_name = model_path.strip("/").split("/")[-1]
     assert bits in [4, 8, 16], "It can be only loaded with 16-bit, 8-bit, and 4-bit."
@@ -37,6 +37,10 @@ def start_worker(model_path: str, bits=16):
         "llava.serve.model_worker",
         "--host",
         "0.0.0.0",
         "--controller",
         "http://localhost:10000",
         "--model-path",
@@ -44,6 +48,8 @@ def start_worker(model_path: str, bits=16):
         "--model-name",
         model_name,
         "--use-flash-attn",
     ]
     if bits != 16:
         worker_command += [f"--load-{bits}bit"]
@@ -77,12 +83,21 @@ Set the environment variable `model` to change the model:
     print(f"args: {gws.args}")
-    model_path = os.getenv("model", "liuhaotian/llava-v1.6-mistral-7b")
     bits = int(os.getenv("bits", 4))
     concurrency_count = int(os.getenv("concurrency_count", 5))
     controller_proc = start_controller()
-    worker_proc = start_worker(model_path, bits=bits)
     # Wait for worker and controller to start
     time.sleep(10)
@@ -103,7 +118,8 @@ Set the environment variable `model` to change the model:
         print(e)
         exit_status = 1
     finally:
-        worker_proc.kill()
         controller_proc.kill()
         sys.exit(exit_status)

     return subprocess.Popen(controller_command)
+def start_worker(model_path: str, bits=16, revision='main', port=21002):
     print(f"Starting the model worker for the model {model_path}")
     model_name = model_path.strip("/").split("/")[-1]
     assert bits in [4, 8, 16], "It can be only loaded with 16-bit, 8-bit, and 4-bit."
         "llava.serve.model_worker",
         "--host",
         "0.0.0.0",
+        "--port",
+        port,
+        "--worker-address",
+        f"http://127.0.0.1:{port}",
         "--controller",
         "http://localhost:10000",
         "--model-path",
         "--model-name",
         model_name,
         "--use-flash-attn",
+        "--revision",
+        revision
     ]
     if bits != 16:
         worker_command += [f"--load-{bits}bit"]
     print(f"args: {gws.args}")
+    model_paths = os.getenv("model", "nvandal/LLaVA-Med-v1.5-7b")
+    revisions = os.getenv("revision", "main")
     bits = int(os.getenv("bits", 4))
     concurrency_count = int(os.getenv("concurrency_count", 5))
     controller_proc = start_controller()
+    start_worker_port = 21002
+    model_paths = model_paths.split(';')
+    revisions = revisions.split(';')
+    assert(len(model_paths)==len(revisions))
+    worker_proc = [None]*len(model_paths)
+    for i, (model_path, revision) in enumerate(zip(model_paths,revisions)):
+        print(model_path, revision)
+        worker_proc[i] = start_worker(model_path, bits=bits, revision=revision, port=str(start_worker_port+i))
     # Wait for worker and controller to start
     time.sleep(10)
         print(e)
         exit_status = 1
     finally:
+        for w in worker_proc:
+            w.kill()
         controller_proc.kill()
         sys.exit(exit_status)