li-qing committed
Commit 59a40c7
1 Parent(s): a09be9c

feat: fire model

assets/chart.png ADDED
assets/clver_1.jpg ADDED
assets/fire_logo.png ADDED
src/__pycache__/conversation.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/conversation.cpython-310.pyc and b/src/__pycache__/conversation.cpython-310.pyc differ
 
src/conversation.py CHANGED
@@ -2091,8 +2091,8 @@ register_conv_template(
 conv_llava_llama_3 = Conversation(
     name="llava-original",
     system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
-    roles=("user\n\n",
-           "assistant\n\n"),
+    roles=("user",
+           "assistant"),
     # version="llama3",
     messages=[],
     offset=0,
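
The role strings are now plain "user"/"assistant", leaving separators (and any header tokens) to the template itself. A minimal sketch of how the trimmed roles are consumed, assuming this fork keeps FastChat's get_conv_template helper; the message text is illustrative:

# Hedged sketch: the role strings only label messages; get_prompt() supplies
# the separators, so the roles no longer need to carry a trailing "\n\n".
from src.conversation import get_conv_template

conv = get_conv_template("llava-original")
conv.append_message(conv.roles[0], "Describe this image.")  # roles[0] == "user"
conv.append_message(conv.roles[1], None)                    # roles[1] == "assistant"
prompt = conv.get_prompt()
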
src/model/model_llava.py CHANGED
@@ -26,14 +26,16 @@ def load_llava_model(lora_checkpoint=None):
             model_path, None, model_name, device_map=device_map)  # Add any other thing you want to pass in llava_model_args
     else:
         tokenizer, model, image_processor, max_length = load_pretrained_model(
-            lora_checkpoint, model_path, model_name, device_map=device_map)
+            lora_checkpoint, model_path, "llava_lora", device_map=device_map)
 
     model.eval()
     model.tie_weights()
+    logger.info("model device {}", model.device)
     return tokenizer, model, image_processor, conv_template
 
-tokenizer_llava, model_llava, image_processor_llava, conv_template_llava = load_llava_model("checkpoints")
-
+tokenizer_llava, model_llava, image_processor_llava, conv_template_llava = load_llava_model(None)
+tokenizer_llava_fire, model_llava_fire, image_processor_llava_fire, conv_template_llava = load_llava_model("checkpoints/")
+model_llava_fire.to("cuda")
 @spaces.GPU
 def inference():
     image = Image.open("assets/example.jpg").convert("RGB")
@@ -77,7 +79,7 @@ def inference_by_prompt_and_images(prompt, images):
     image_tensor = image_tensor.to(dtype=torch.float16, device=device)
     input_ids = tokenizer_image_token(prompt, tokenizer_llava, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
     image_sizes = [image.size for image in images]
-    logger.info("Shape: {};{}", input_ids.shape, image_tensor.shape)
+    logger.info("Shape: {};{}; Devices: {};{}", input_ids.shape, image_tensor.shape, input_ids.device, image_tensor.device)
     with torch.inference_mode():
         cont = model_llava.generate(
             input_ids,
@@ -92,5 +94,32 @@ def inference_by_prompt_and_images(prompt, images):
     logger.info("response={}", text_outputs)
     return text_outputs
 
+@spaces.GPU
+def inference_by_prompt_and_images_fire(prompt, images):
+    device = "cuda"
+    if len(images) > 0 and type(images[0]) is str:
+        image_data = []
+        for image in images:
+            image_data.append(Image.open(BytesIO(base64.b64decode(image))))
+        images = image_data
+    image_tensor = process_images(images, image_processor_llava, model_llava.config)
+    image_tensor = image_tensor.to(dtype=torch.float16, device=device)
+    input_ids = tokenizer_image_token(prompt, tokenizer_llava, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+    image_sizes = [image.size for image in images]
+    logger.info("Shape: {};{}; Devices: {};{}", input_ids.shape, image_tensor.shape, input_ids.device, image_tensor.device)
+    with torch.inference_mode():
+        cont = model_llava_fire.generate(
+            input_ids,
+            images=image_tensor,
+            image_sizes=image_sizes,
+            do_sample=False,
+            temperature=0,
+            max_new_tokens=256,
+            use_cache=True
+        )
+    text_outputs = tokenizer_llava.batch_decode(cont, skip_special_tokens=True)
+    logger.info("response={}", text_outputs)
+    return text_outputs
+
 if __name__ == "__main__":
     inference()
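
As a quick sanity check, the new FIRE entry point can be exercised directly. The hedged sketch below uses assets/example.jpg (already referenced by inference()); the prompt string and its Llama 3-style header layout are assumptions about this fork's chat template, not taken from the diff:

# Hypothetical smoke test for inference_by_prompt_and_images_fire. The function
# accepts either PIL images or base64-encoded strings, which it decodes itself
# (see the diff above), and "<image>" is replaced by tokenizer_image_token.
import base64

from src.model.model_llava import inference_by_prompt_and_images_fire

with open("assets/example.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

# Assumed Llama 3-style chat markup; the real prompt should come from
# Conversation.get_prompt() and may differ.
prompt = (
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "<image>\nDescribe this image.<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
print(inference_by_prompt_and_images_fire(prompt, [encoded])[0])
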
src/serve/__pycache__/gradio_block_arena_vision_named.cpython-310.pyc CHANGED
Binary files a/src/serve/__pycache__/gradio_block_arena_vision_named.cpython-310.pyc and b/src/serve/__pycache__/gradio_block_arena_vision_named.cpython-310.pyc differ
 
src/serve/__pycache__/gradio_web_server.cpython-310.pyc CHANGED
Binary files a/src/serve/__pycache__/gradio_web_server.cpython-310.pyc and b/src/serve/__pycache__/gradio_web_server.cpython-310.pyc differ
 
src/serve/gradio_block_arena_vision_named.py CHANGED
@@ -243,8 +243,7 @@ def add_text(
 
 def build_side_by_side_vision_ui_named(models, random_questions=None):
     notice_markdown = """
-# βš”οΈ Vision Arena βš”οΈ : Benchmarking VLMs (FIRE-LLAVA VS. LLAVA)
-| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
+# βš”οΈ Vision Arena βš”οΈ : Benchmarking LLAVA-FIRE VS. LLAVA
 
 ## πŸ“œ Rules
 - Chat with any two models side-by-side and vote!
@@ -334,7 +333,11 @@ def build_side_by_side_vision_ui_named(models, random_questions=None):
             clear_btn = gr.Button(value="πŸ—‘οΈ Clear history", interactive=False)
             regenerate_btn = gr.Button(value="πŸ”„ Regenerate", interactive=False)
             share_btn = gr.Button(value="πŸ“· Share")
-
+    with gr.Row():
+        gr.Examples(examples=[
+            [{"files": ["assets/chart.png"], "text": "What's the percentage value of Japan who have a favorable view of the US? Answer the question using a single word or phrase."}],
+            [{"files": ["assets/clver_1.jpg"], "text": "Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\nQuestion: Is the number of metal cars that are left of the tiny matte school bus greater than the number of tiny cyan double buss?\nChoices:\n(A) Yes\n(B) No"}],
+        ], inputs=[textbox])
     with gr.Accordion("Parameters", open=False) as parameter_row:
         temperature = gr.Slider(
             minimum=0.0,
@@ -402,7 +405,7 @@ def build_side_by_side_vision_ui_named(models, random_questions=None):
             flash_buttons, [], btn_list
         )
     clear_btn.click(clear_history, None, states + chatbots + [textbox] + btn_list)
-
+
     share_js = """
 function (a, b, c, d) {
     const captureElement = document.querySelector('#share-region-named');
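
The example payloads follow Gradio's multimodal input format. A standalone sketch, assuming textbox is a gr.MultimodalTextbox in this UI (which is why each example is a {"files": [...], "text": ...} dict); the component settings here are illustrative, not copied from this repo:

# Hedged, self-contained sketch of the gr.Examples wiring added above.
import gradio as gr

with gr.Blocks() as demo:
    textbox = gr.MultimodalTextbox(file_types=["image"], placeholder="Enter text or upload an image")
    gr.Examples(
        examples=[
            [{"files": ["assets/chart.png"], "text": "What's the percentage value of Japan who have a favorable view of the US?"}],
        ],
        inputs=[textbox],
    )

if __name__ == "__main__":
    demo.launch()
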
src/serve/gradio_web_server.py CHANGED
@@ -410,7 +410,7 @@ def bot_response(
     top_p,
     max_new_tokens,
     request: gr.Request,
-    apply_rate_limit=True,
+    apply_rate_limit=False,
     use_recommended_config=False,
 ):
     ip = get_ip(request)
@@ -440,14 +440,16 @@ def bot_response(
         api_endpoint_info[model_name] if model_name in api_endpoint_info else None
     )
     images = conv.get_images()
-    logger.info(f"model_name: {model_name};model_api_dict: {model_api_dict}")
+    logger.info(f"model_name: {model_name}; model_api_dict: {model_api_dict}; msg: {conv.messages}")
     if model_api_dict is None:
         if model_name == "llava-original":
-            from src.model.model_llava import inference, inference_by_prompt_and_images
-            logger.info(f"prompt: {conv.get_prompt()}; images: {images}")
+            from src.model.model_llava import inference_by_prompt_and_images
+            logger.info(f"prompt for llava-original: {conv.get_prompt()}; images: {len(images)}")
             output_text = inference_by_prompt_and_images(conv.get_prompt(), images)[0]
         else:
-            output_text = "hello"
+            from src.model.model_llava import inference_by_prompt_and_images_fire
+            logger.info(f"prompt for llava-fire: {conv.get_prompt()}; images: {len(images)}")
+            output_text = inference_by_prompt_and_images_fire(conv.get_prompt(), images)[0]
         stream_iter = [{
             "error_code": 0,
             "text": output_text