kirp
/

kosmos2_5

@@ -2,7 +2,7 @@
 language: en
 license: mit
 ---
-# Kosmos-2.5 (Testing)
 [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
@@ -15,14 +15,27 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
 ## NOTE
 Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
-## Download transformers：
 ```bash
 pip install git+https://github.com/tic-top/transformers.git
 ```
-This repo will be soon merged to official Transformers.
-## Usage
-### OCR
 Run with [ocr.py](ocr.py).
 ```text
 55,595,71,595,71,629,55,629,1
@@ -41,22 +54,6 @@ Run with [ocr.py](ocr.py).
 17,1096,868,1096,868,1150,17,1150,Card Payment 50,000
 ```
 ![output](output.png)
-### Markdown
-Run with [md.py](md.py).
-```text
-- **1 \[REG\] BLACK SAKURA** 45,455
-- **1 COOKIE DOH SAUCES** 0
-- **1 NATA DE COCO** 0
-- **Sub Total** 45,455
-- **PB1 (10%)** 4,545
-- **Rounding** 0
-- **Total** **50,000**
-Card Payment 50,000
-```
 ## Citation

 language: en
 license: mit
 ---
+# Kosmos-2.5
 [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
 ## NOTE
 Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
+## How to Use?
+This repo will soon be merged into the official Transformers library.
 ```bash
 pip install git+https://github.com/tic-top/transformers.git
 ```
+### Markdown Task
+Run with [md.py](md.py).
+```text
+- **1 \[REG\] BLACK SAKURA** 45,455
+- **1 COOKIE DOH SAUCES** 0
+- **1 NATA DE COCO** 0
+- **Sub Total** 45,455
+- **PB1 (10%)** 4,545
+- **Rounding** 0
+- **Total** **50,000**
+Card Payment 50,000
+```
+### OCR Task
 Run with [ocr.py](ocr.py).
 ```text
 55,595,71,595,71,629,55,629,1
 17,1096,868,1096,868,1150,17,1150,Card Payment 50,000
 ```
 ![output](output.png)
 ## Citation

md.py CHANGED Viewed

@@ -4,15 +4,17 @@ import requests
 from PIL import Image, ImageDraw
 from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
-repo = "microsoft/kosmos-2.5" #
-repo = "kirp/kosmos2_5"
 device = "cuda:0"
 dtype = torch.bfloat16
 model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
 processor = AutoProcessor.from_pretrained(repo)
 url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
 image = Image.open(requests.get(url, stream=True).raw)
 prompt = "<md>"
 inputs = processor(text=prompt, images=image, return_tensors="pt")

 from PIL import Image, ImageDraw
 from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
+repo = "microsoft/kosmos-2.5"
 device = "cuda:0"
 dtype = torch.bfloat16
 model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
 processor = AutoProcessor.from_pretrained(repo)
+# sample image
+url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
 url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
 image = Image.open(requests.get(url, stream=True).raw)
 prompt = "<md>"
 inputs = processor(text=prompt, images=image, return_tensors="pt")

ocr.py CHANGED Viewed

@@ -4,26 +4,32 @@ import requests
 from PIL import Image, ImageDraw
 from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
-repo = "microsoft/kosmos-2.5" #
-repo = "kirp/kosmos2_5"
 device = "cuda:0"
 dtype = torch.bfloat16
 model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
 processor = AutoProcessor.from_pretrained(repo)
 url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
 image = Image.open(requests.get(url, stream=True).raw)
 prompt = "<ocr>"
 inputs = processor(text=prompt, images=image, return_tensors="pt")
-# batch input
-# inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
 height, width = inputs.pop("height"), inputs.pop("width")
 raw_width, raw_height = image.size
 scale_height = raw_height / height
 scale_width = raw_width / width
 inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
 inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
 generated_ids = model.generate(

 from PIL import Image, ImageDraw
 from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
+repo = "microsoft/kosmos-2.5"
 device = "cuda:0"
 dtype = torch.bfloat16
 model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
 processor = AutoProcessor.from_pretrained(repo)
+# sample image
+url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
 url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
 image = Image.open(requests.get(url, stream=True).raw)
+# single image
 prompt = "<ocr>"
 inputs = processor(text=prompt, images=image, return_tensors="pt")
 height, width = inputs.pop("height"), inputs.pop("width")
 raw_width, raw_height = image.size
 scale_height = raw_height / height
 scale_width = raw_width / width
+# batch generate
+# inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
+# height, width = inputs.pop("height"), inputs.pop("width")
+# raw_width, raw_height = image.size
+# scale_height = raw_height / height[0]
+# scale_width = raw_width / width[0]
 inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
 inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
 generated_ids = model.generate(