Spaces:

dan-durbin
/

Kosmos-2.5

Running on Zero

App Files Files Community

dan-durbin committed on Aug 5, 2024

Commit

0b1f1a9

1 Parent(s): 9aa5f5e

Claude 3.5-assisted interface changes to allow switching between Markdown and OCR modes and adjusting generation parameters

Browse files

Files changed (1) hide show

app.py +25 -12

app.py CHANGED Viewed

@@ -17,11 +17,10 @@ model = AutoModelForVision2Seq.from_pretrained(
 processor = AutoProcessor.from_pretrained(repo)
-prompt = "<ocr>"  # Options are '<ocr>' and '<md>'
 @spaces.GPU
-def process_image(image_path):
     image = Image.open(image_path)
     inputs = processor(text=prompt, images=image, return_tensors="pt")
@@ -33,17 +32,23 @@ def process_image(image_path):
     inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
-    generated_ids = model.generate(**inputs, max_new_tokens=2048)
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return postprocess(generated_text, scale_height, scale_width, image)
-def postprocess(y, scale_height, scale_width, original_image):
     y = y.replace(prompt, "")
     if "<md>" in prompt:
-        return y, original_image
     pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
     bboxs_raw = re.findall(pattern, y)
@@ -54,7 +59,6 @@ def postprocess(y, scale_height, scale_width, original_image):
     info = ""
-    # Create a copy of the original image to draw on
     image_with_boxes = original_image.copy()
     draw = ImageDraw.Draw(image_with_boxes)
@@ -69,7 +73,6 @@ def postprocess(y, scale_height, scale_width, original_image):
             y1 = int(y1 * scale_height)
             info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}\n"
-            # Draw rectangle on the image
             draw.rectangle([x0, y0, x1, y1], outline="red", width=2)
     return image_with_boxes, info
@@ -77,11 +80,21 @@ def postprocess(y, scale_height, scale_width, original_image):
 iface = gr.Interface(
     fn=process_image,
-    inputs=gr.Image(type="filepath"),
     outputs=[
-        gr.Image(type="pil", label="Image with Bounding Boxes"),
-        gr.Textbox(label="Extracted Text"),
     ],
 )
 iface.launch()

 processor = AutoProcessor.from_pretrained(repo)
 @spaces.GPU
+def process_image(image_path, task, num_beams, max_new_tokens, temperature):
+    prompt = "<ocr>" if task == "OCR" else "<md>"
     image = Image.open(image_path)
     inputs = processor(text=prompt, images=image, return_tensors="pt")
     inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
+    generated_ids = model.generate(
+        **inputs,
+        num_beams=num_beams,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+    )
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    return postprocess(generated_text, scale_height, scale_width, image, prompt)
+@spaces.GPU
+def postprocess(y, scale_height, scale_width, original_image, prompt):
     y = y.replace(prompt, "")
     if "<md>" in prompt:
+        return original_image, y
     pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
     bboxs_raw = re.findall(pattern, y)
     info = ""
     image_with_boxes = original_image.copy()
     draw = ImageDraw.Draw(image_with_boxes)
             y1 = int(y1 * scale_height)
             info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}\n"
             draw.rectangle([x0, y0, x1, y1], outline="red", width=2)
     return image_with_boxes, info
 iface = gr.Interface(
     fn=process_image,
+    inputs=[
+        gr.Image(type="filepath", label="Input Image"),
+        gr.Radio(["OCR", "Markdown"], label="Task", value="OCR"),
+        gr.Slider(1, 10, value=4, step=1, label="Number of Beams"),
+        gr.Slider(100, 4000, value=2048, step=100, label="Max New Tokens"),
+        gr.Slider(0.1, 1.0, value=1.0, step=0.1, label="Temperature"),
+    ],
     outputs=[
+        gr.Image(type="pil", label="Image with Bounding Boxes (OCR only)"),
+        gr.Textbox(label="Extracted Text / Markdown"),
     ],
+    title="Kosmos 2.5 OCR and Markdown Generator",
+    description="""Generate OCR results or Markdown from images using Kosmos 2.5.
+    Uses the Kosmos 2.5 [PR Branch] of the Transformers library for inference.
+    I don't know if the parameters do much of anything, but they're available for tweaking just in case.""",
 )
 iface.launch()