kirp@umich.edu committed on
Commit
ff64714
1 Parent(s): b175204

add comments

Browse files
Files changed (3) hide show
  1. README.md +18 -21
  2. md.py +4 -2
  3. ocr.py +12 -6
README.md CHANGED
@@ -2,7 +2,7 @@
2
  language: en
3
  license: mit
4
  ---
5
- # Kosmos-2.5 (Testing)
6
 
7
  [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
8
 
@@ -15,14 +15,27 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
15
  ## NOTE
16
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
17
 
18
- ## Download transformers:
 
19
  ```bash
20
  pip install git+https://github.com/tic-top/transformers.git
21
  ```
22
- This repo will be soon merged to official Transformers.
23
 
24
- ## Usage
25
- ### OCR
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  Run with [ocr.py](ocr.py).
27
  ```text
28
  55,595,71,595,71,629,55,629,1
@@ -41,22 +54,6 @@ Run with [ocr.py](ocr.py).
41
  17,1096,868,1096,868,1150,17,1150,Card Payment 50,000
42
  ```
43
  ![output](output.png)
44
- ### Markdown
45
- Run with [md.py](md.py).
46
-
47
- ```text
48
- - **1 \[REG\] BLACK SAKURA** 45,455
49
- - **1 COOKIE DOH SAUCES** 0
50
- - **1 NATA DE COCO** 0
51
- - **Sub Total** 45,455
52
- - **PB1 (10%)** 4,545
53
- - **Rounding** 0
54
- - **Total** **50,000**
55
-
56
- Card Payment 50,000
57
- ```
58
-
59
-
60
 
61
 
62
  ## Citation
 
2
  language: en
3
  license: mit
4
  ---
5
+ # Kosmos-2.5
6
 
7
  [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
8
 
 
15
  ## NOTE
16
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
17
 
18
+ ## How to Use?
19
+ This repo will soon be merged into the official Transformers.
20
  ```bash
21
  pip install git+https://github.com/tic-top/transformers.git
22
  ```
 
23
 
24
+ ### Markdown Task
25
+ Run with [md.py](md.py).
26
+ ```text
27
+ - **1 \[REG\] BLACK SAKURA** 45,455
28
+ - **1 COOKIE DOH SAUCES** 0
29
+ - **1 NATA DE COCO** 0
30
+ - **Sub Total** 45,455
31
+ - **PB1 (10%)** 4,545
32
+ - **Rounding** 0
33
+ - **Total** **50,000**
34
+
35
+ Card Payment 50,000
36
+ ```
37
+
38
+ ### OCR Task
39
  Run with [ocr.py](ocr.py).
40
  ```text
41
  55,595,71,595,71,629,55,629,1
 
54
  17,1096,868,1096,868,1150,17,1150,Card Payment 50,000
55
  ```
56
  ![output](output.png)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  ## Citation
md.py CHANGED
@@ -4,15 +4,17 @@ import requests
4
  from PIL import Image, ImageDraw
5
  from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
 
7
- repo = "microsoft/kosmos-2.5" #
8
- repo = "kirp/kosmos2_5"
9
  device = "cuda:0"
10
  dtype = torch.bfloat16
11
  model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
12
  processor = AutoProcessor.from_pretrained(repo)
13
 
 
 
14
  url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
15
  image = Image.open(requests.get(url, stream=True).raw)
 
16
  prompt = "<md>"
17
  inputs = processor(text=prompt, images=image, return_tensors="pt")
18
 
 
4
  from PIL import Image, ImageDraw
5
  from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
 
7
+ repo = "microsoft/kosmos-2.5"
 
8
  device = "cuda:0"
9
  dtype = torch.bfloat16
10
  model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
11
  processor = AutoProcessor.from_pretrained(repo)
12
 
13
+ # sample image
14
+ url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
15
  url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
16
  image = Image.open(requests.get(url, stream=True).raw)
17
+
18
  prompt = "<md>"
19
  inputs = processor(text=prompt, images=image, return_tensors="pt")
20
 
ocr.py CHANGED
@@ -4,26 +4,32 @@ import requests
4
  from PIL import Image, ImageDraw
5
  from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
 
7
-
8
- repo = "microsoft/kosmos-2.5" #
9
- repo = "kirp/kosmos2_5"
10
  device = "cuda:0"
11
  dtype = torch.bfloat16
12
  model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
13
  processor = AutoProcessor.from_pretrained(repo)
14
 
 
 
15
  url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
16
  image = Image.open(requests.get(url, stream=True).raw)
 
 
17
  prompt = "<ocr>"
18
  inputs = processor(text=prompt, images=image, return_tensors="pt")
19
- # batch input
20
- # inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
21
-
22
  height, width = inputs.pop("height"), inputs.pop("width")
23
  raw_width, raw_height = image.size
24
  scale_height = raw_height / height
25
  scale_width = raw_width / width
26
 
 
 
 
 
 
 
 
27
  inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
28
  inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
29
  generated_ids = model.generate(
 
4
  from PIL import Image, ImageDraw
5
  from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
 
7
+ repo = "microsoft/kosmos-2.5"
 
 
8
  device = "cuda:0"
9
  dtype = torch.bfloat16
10
  model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
11
  processor = AutoProcessor.from_pretrained(repo)
12
 
13
+ # sample image
14
+ url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
15
  url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
16
  image = Image.open(requests.get(url, stream=True).raw)
17
+
18
+ # single image
19
  prompt = "<ocr>"
20
  inputs = processor(text=prompt, images=image, return_tensors="pt")
 
 
 
21
  height, width = inputs.pop("height"), inputs.pop("width")
22
  raw_width, raw_height = image.size
23
  scale_height = raw_height / height
24
  scale_width = raw_width / width
25
 
26
+ # batch generate
27
+ # inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
28
+ # height, width = inputs.pop("height"), inputs.pop("width")
29
+ # raw_width, raw_height = image.size
30
+ # scale_height = raw_height / height[0]
31
+ # scale_width = raw_width / width[0]
32
+
33
  inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
34
  inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
35
  generated_ids = model.generate(