geekyrakshit committed
Commit 7934a8e • 1 Parent: e4a917d
update: fix bug in LLMClient + add FigureAnnotator
.gitignore
CHANGED
@@ -17,6 +17,7 @@ wandb/
 .byaldi/
 cursor_prompt.txt
 test.py
+test.ipynb
 uv.lock
 grays-anatomy-bm25s/
 prompt**.txt
medrag_multi_modal/assistant/__init__.py
CHANGED
@@ -1,4 +1,5 @@
-from .llm_client import LLMClient
+from .figure_annotation import FigureAnnotator
+from .llm_client import ClientType, LLMClient
 from .medqa_assistant import MedQAAssistant
 
-__all__ = ["LLMClient", "MedQAAssistant"]
+__all__ = ["LLMClient", "ClientType", "MedQAAssistant", "FigureAnnotator"]
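With these exports in place, downstream code can import everything from the package root. A minimal wiring sketch, not part of this commit: the model name is illustrative, and the LLMClient field names (model_name, client_type) are inferred from the diff below, so treat them as assumptions.

    from medrag_multi_modal.assistant import ClientType, FigureAnnotator, LLMClient

    # Hypothetical setup; field names inferred from self.model_name and
    # ClientType usage in llm_client.py, model name is illustrative only.
    llm_client = LLMClient(model_name="gemini-1.5-flash", client_type=ClientType.GEMINI)
    annotator = FigureAnnotator(llm_client=llm_client)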
medrag_multi_modal/assistant/figure_annotation.py
ADDED
@@ -0,0 +1,53 @@
+import os
+from typing import Union
+
+import cv2
+import weave
+from PIL import Image
+from rich.progress import track
+
+from ..utils import get_wandb_artifact, read_jsonl_file
+from .llm_client import LLMClient
+
+
+class FigureAnnotator(weave.Model):
+    llm_client: LLMClient
+
+    @weave.op()
+    def annotate_figures(
+        self, page_image: Image.Image
+    ) -> dict[str, Union[Image.Image, str]]:
+        annotation = self.llm_client.predict(
+            system_prompt="""
+You are an expert in the domain of scientific textbooks, especially medical texts.
+You are presented with a page from a scientific textbook.
+You are to first identify the number of figures in the image.
+Then you are to identify the figure IDs associated with each figure in the image.
+Then, you are to extract the exact figure descriptions from the image.
+
+Here are some clues you need to follow:
+1. Figure IDs are unique identifiers for each figure in the image.
+2. Sometimes figure IDs can also be found as captions to the immediate left, right, top, or bottom of the figure.
+3. Figure IDs are in the form "Fig X.Y" where X and Y are integers. For example, 1.1, 1.2, 1.3, etc.
+4. Figure descriptions are contained as captions under the figures in the image, just after the figure ID.
+5. The text in the image is written in English and is present in a two-column format.
+6. There is a clear distinction between the figure caption and the regular text in the image in the form of extra white space.
+7. There might be multiple figures present in the image.
+            """,
+            user_prompt=[page_image],
+        )
+        return {"page_image": page_image, "annotations": annotation}
+
+    @weave.op()
+    def predict(self, image_artifact_address: str):
+        artifact_dir = get_wandb_artifact(image_artifact_address, "dataset")
+        metadata = read_jsonl_file(os.path.join(artifact_dir, "metadata.jsonl"))
+        annotations = []
+        for item in track(metadata, description="Annotating images:"):
+            page_image = cv2.imread(
+                os.path.join(artifact_dir, f"page{item['page_idx']}.png")
+            )
+            page_image = cv2.cvtColor(page_image, cv2.COLOR_BGR2RGB)
+            page_image = Image.fromarray(page_image)
+            annotations.append(self.annotate_figures(page_image=page_image))
+        return annotations
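To exercise the annotator on a single page outside of predict, the same BGR-to-RGB conversion can be replicated by hand. A rough sketch, assuming a local page0.png and reusing the hypothetical annotator from the sketch above:

    import cv2
    from PIL import Image

    # cv2.imread returns a BGR ndarray; PIL (and the LLM client) expect RGB,
    # hence the cvtColor call before wrapping the array in a PIL Image.
    bgr = cv2.imread("page0.png")  # hypothetical local file
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    page_image = Image.fromarray(rgb)

    result = annotator.annotate_figures(page_image=page_image)
    print(result["annotations"])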
medrag_multi_modal/assistant/llm_client.py
CHANGED
@@ -9,7 +9,7 @@ from PIL import Image
 from ..utils import base64_encode_image
 
 
-class ClientType(Enum):
+class ClientType(str, Enum):
     GEMINI = "gemini"
     MISTRAL = "mistral"
 
@@ -80,7 +80,7 @@ class LLMClient(weave.Model):
         ]
 
         client = Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))
-        client = instructor.from_mistral(client)
+        client = instructor.from_mistral(client) if schema is not None else client
 
         response = (
             client.chat.complete(model=self.model_name, messages=messages)
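Two fixes here: the Mistral client is now only wrapped by instructor when a structured-output schema is actually requested (schema appears to be an optional argument of predict), and ClientType gains a str mixin. The mixin matters because plain Enum members never compare equal to their raw string values, which breaks string-based configs and serialization. A standalone sketch of the difference:

    from enum import Enum

    class PlainType(Enum):
        GEMINI = "gemini"

    class StrType(str, Enum):
        GEMINI = "gemini"

    print(PlainType.GEMINI == "gemini")  # False: plain Enum members are not strings
    print(StrType.GEMINI == "gemini")    # True: the str mixin makes the member a string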
medrag_multi_modal/utils.py
CHANGED
@@ -1,6 +1,7 @@
 import base64
 import io
 
+import jsonlines
 import torch
 from PIL import Image
 
@@ -36,8 +37,17 @@ def get_torch_backend():
 
 
 def base64_encode_image(image: Image.Image, mimetype: str) -> str:
+    image.load()
+    if image.mode not in ("RGB", "RGBA"):
+        image = image.convert("RGB")
     byte_arr = io.BytesIO()
     image.save(byte_arr, format="PNG")
     encoded_string = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
     encoded_string = f"data:{mimetype};base64,{encoded_string}"
     return str(encoded_string)
+
+
+def read_jsonl_file(file_path: str) -> list[dict[str, any]]:
+    with jsonlines.open(file_path) as reader:
+        data = [obj for obj in reader]
+    return data
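base64_encode_image now normalizes palette or grayscale pages to RGB before PNG encoding, and read_jsonl_file collects every record in the file, which is what FigureAnnotator.predict relies on when it iterates over the metadata. A small round-trip sketch with a hypothetical metadata file:

    import jsonlines
    from medrag_multi_modal.utils import read_jsonl_file

    # Hypothetical metadata matching what FigureAnnotator.predict expects.
    with jsonlines.open("metadata.jsonl", mode="w") as writer:
        writer.write({"page_idx": 0})
        writer.write({"page_idx": 1})

    metadata = read_jsonl_file("metadata.jsonl")
    print(metadata)  # [{'page_idx': 0}, {'page_idx': 1}]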
pyproject.toml
CHANGED
@@ -42,6 +42,7 @@ dependencies = [
     "mistralai>=1.1.0",
     "instructor>=1.6.3",
     "jsonlines>=4.0.0",
+    "opencv-python>=4.10.0.84",
 ]
 
 [project.optional-dependencies]
@@ -69,6 +70,7 @@ core = [
     "mistralai>=1.1.0",
     "instructor>=1.6.3",
     "jsonlines>=4.0.0",
+    "opencv-python>=4.10.0.84",
 ]
 
 dev = ["pytest>=8.3.3", "isort>=5.13.2", "black>=24.10.0", "ruff>=0.6.9"]