kirp@umich.edu
committed on
Commit
•
629edc1
1
Parent(s):
ff64714
README.md
CHANGED
@@ -15,12 +15,7 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
|
|
15 |
## NOTE
|
16 |
Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
|
17 |
|
18 |
-
##
|
19 |
-
This repo will be soon merged to official Transformers.
|
20 |
-
```bash
|
21 |
-
pip install git+https://github.com/tic-top/transformers.git
|
22 |
-
```
|
23 |
-
|
24 |
### Markdown Task
|
25 |
Run with [md.py](md.py).
|
26 |
```text
|
|
|
15 |
## NOTE
|
16 |
Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
|
17 |
|
18 |
+
## Usage
|
|
|
|
|
|
|
|
|
|
|
19 |
### Markdown Task
|
20 |
Run with [md.py](md.py).
|
21 |
```text
|
md.py
CHANGED
@@ -12,7 +12,6 @@ processor = AutoProcessor.from_pretrained(repo)
|
|
12 |
|
13 |
# sample image
|
14 |
url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
|
15 |
-
url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
|
16 |
image = Image.open(requests.get(url, stream=True).raw)
|
17 |
|
18 |
prompt = "<md>"
|
|
|
12 |
|
13 |
# sample image
|
14 |
url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
|
|
|
15 |
image = Image.open(requests.get(url, stream=True).raw)
|
16 |
|
17 |
prompt = "<md>"
|
ocr.py
CHANGED
@@ -5,17 +5,16 @@ from PIL import Image, ImageDraw
|
|
5 |
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
|
6 |
|
7 |
repo = "microsoft/kosmos-2.5"
|
8 |
-
device = "cuda:
|
9 |
dtype = torch.bfloat16
|
10 |
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
|
11 |
processor = AutoProcessor.from_pretrained(repo)
|
12 |
|
13 |
# sample image
|
14 |
url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
|
15 |
-
url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
|
16 |
image = Image.open(requests.get(url, stream=True).raw)
|
17 |
|
18 |
-
#
|
19 |
prompt = "<ocr>"
|
20 |
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
21 |
height, width = inputs.pop("height"), inputs.pop("width")
|
@@ -23,12 +22,12 @@ raw_width, raw_height = image.size
|
|
23 |
scale_height = raw_height / height
|
24 |
scale_width = raw_width / width
|
25 |
|
26 |
-
# batch
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
|
33 |
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
|
34 |
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
|
|
|
5 |
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
|
6 |
|
7 |
repo = "microsoft/kosmos-2.5"
|
8 |
+
device = "cuda:1"
|
9 |
dtype = torch.bfloat16
|
10 |
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
|
11 |
processor = AutoProcessor.from_pretrained(repo)
|
12 |
|
13 |
# sample image
|
14 |
url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
|
|
|
15 |
image = Image.open(requests.get(url, stream=True).raw)
|
16 |
|
17 |
+
# bs = 1
|
18 |
prompt = "<ocr>"
|
19 |
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
20 |
height, width = inputs.pop("height"), inputs.pop("width")
|
|
|
22 |
scale_height = raw_height / height
|
23 |
scale_width = raw_width / width
|
24 |
|
25 |
+
# bs > 1, batch decoding sample
|
26 |
+
inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
|
27 |
+
height, width = inputs.pop("height"), inputs.pop("width")
|
28 |
+
raw_width, raw_height = image.size
|
29 |
+
scale_height = raw_height / height[0]
|
30 |
+
scale_width = raw_width / width[0]
|
31 |
|
32 |
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
|
33 |
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
|
output.png
CHANGED
Git LFS Details
|
Git LFS Details
|