Ubuntu committed on
Commit
f39cf74
1 Parent(s): edb09fc

support hf

Browse files
Files changed (5) hide show
  1. README.md +19 -51
  2. config.json +1 -1
  3. md.py +33 -0
  4. ocr.py +73 -0
  5. tokenizer.json +2 -2
README.md CHANGED
@@ -12,59 +12,26 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
12
 
13
  [Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)
14
 
15
- ## NOTE:
16
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
17
 
18
- ## Use with transformers:
19
- ```python
20
- from PIL import Image
21
- import requests
22
- import torch
23
- from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
24
- import re
25
- repo = "microsoft/kosmos-2.5"
26
- device = "cuda:0"
27
- dtype = torch.bfloat16
28
- model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
29
- processor = AutoProcessor.from_pretrained(repo)
30
- url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
31
- image = Image.open(requests.get(url, stream=True).raw)
32
- prompt = "<ocr>" # <md>
33
- inputs = processor(text=prompt, images=image, return_tensors="pt")
34
- height, width = inputs.pop("height"), inputs.pop("width")
35
- raw_width, raw_height = image.size
36
- scale_height = raw_height / height
37
- scale_width = raw_width / width
38
- inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
39
- inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
40
- generated_ids = model.generate(
41
- **inputs,
42
- max_new_tokens=1024,
43
- )
44
- generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
45
- def postprocess(y, scale_height, scale_width):
46
- y = y.replace(prompt, "")
47
- if "<md>" in prompt:
48
- return y
49
- pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
50
- bboxs_raw = re.findall(pattern, y)
51
- lines = re.split(pattern, y)[1:]
52
- bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
53
- bboxs = [[int(j) for j in i] for i in bboxs]
54
- info = ""
55
- for i in range(len(lines)):
56
- box = bboxs[i]
57
- x0, y0, x1, y1 = box
58
- if not (x0 >= x1 or y0 >= y1):
59
- x0 = int(x0 * scale_width)
60
- y0 = int(y0 * scale_height)
61
- x1 = int(x1 * scale_width)
62
- y1 = int(y1 * scale_height)
63
- info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
64
- return info
65
- output_text = postprocess(generated_text[0], scale_height, scale_width)
66
- print(output_text)
67
  ```
 
 
 
68
  ```text
69
  55,595,71,595,71,629,55,629,1
70
  82,595,481,595,481,635,82,635,[REG] BLACK SAKURA
@@ -81,7 +48,7 @@ print(output_text)
81
  24,905,858,905,858,956,24,956,Total 50,000
82
  17,1096,868,1096,868,1150,17,1150,Card Payment 50,000
83
  ```
84
-
85
 
86
 
87
  ## Citation
@@ -103,3 +70,4 @@ The content of this project itself is licensed under the [MIT](https://github.co
103
  [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
104
 
105
 
 
 
12
 
13
  [Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)
14
 
15
+ ## NOTE
16
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
17
 
18
+ ## Usage
19
+ ### Markdown Task
20
+ Run with [md.py](md.py).
21
+ ```text
22
+ - **1 \[REG\] BLACK SAKURA** 45,455
23
+ - **1 COOKIE DOH SAUCES** 0
24
+ - **1 NATA DE COCO** 0
25
+ - **Sub Total** 45,455
26
+ - **PB1 (10%)** 4,545
27
+ - **Rounding** 0
28
+ - **Total** **50,000**
29
+
30
+ Card Payment 50,000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ```
32
+
33
+ ### OCR Task
34
+ Run with [ocr.py](ocr.py).
35
  ```text
36
  55,595,71,595,71,629,55,629,1
37
  82,595,481,595,481,635,82,635,[REG] BLACK SAKURA
 
48
  24,905,858,905,858,956,24,956,Total 50,000
49
  17,1096,868,1096,868,1150,17,1150,Card Payment 50,000
50
  ```
51
+ ![output](output.png)
52
 
53
 
54
  ## Citation
 
70
  [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
71
 
72
 
73
+
config.json CHANGED
@@ -148,4 +148,4 @@
148
  "typical_p": 1.0,
149
  "use_bfloat16": false
150
  }
151
- }
 
148
  "typical_p": 1.0,
149
  "use_bfloat16": false
150
  }
151
+ }
md.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ import requests
4
+ from PIL import Image, ImageDraw
5
+ from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
+
7
# Markdown demo: load Kosmos-2.5, run the "<md>" prompt on a sample receipt
# image and print the generated text.
repo = "microsoft/kosmos-2.5"
device = "cuda:0"
dtype = torch.bfloat16
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
processor = AutoProcessor.from_pretrained(repo)

# Sample image. The Hub's "/resolve/" endpoint returns the raw file bytes;
# the "/blob/" URL is the HTML file-viewer page, which PIL cannot decode.
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
response = requests.get(url, stream=True)
# Surface HTTP failures here rather than as an image-decoding error below.
response.raise_for_status()
image = Image.open(response.raw)

prompt = "<md>"
inputs = processor(text=prompt, images=image, return_tensors="pt")

# "height" and "width" are taken out of the processor output before the
# remaining entries are forwarded to generate().
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
# Ratio between the original image size and the processor-reported size.
# Nothing later in this script reads these two values.
scale_height = raw_height / height
scale_width = raw_width / width

# Move every present tensor to the target device; the image patches are
# additionally cast to the dtype the model was loaded with.
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text[0])
ocr.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ import requests
4
+ from PIL import Image, ImageDraw
5
+ from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
+
7
# OCR demo, part 1: load Kosmos-2.5, run the "<ocr>" prompt on a sample
# receipt image and decode the generated ids to text.
repo = "microsoft/kosmos-2.5"
device = "cuda:0"
dtype = torch.bfloat16
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
processor = AutoProcessor.from_pretrained(repo)

# Sample image. The Hub's "/resolve/" endpoint returns the raw file bytes;
# the "/blob/" URL is the HTML file-viewer page, which PIL cannot decode.
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
response = requests.get(url, stream=True)
# Surface HTTP failures here rather than as an image-decoding error below.
response.raise_for_status()
image = Image.open(response.raw)

# Single-image case (batch size 1).
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
# "height" and "width" are taken out of the processor output before the
# remaining entries are forwarded to generate().
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
# Factors used later to map generated box coordinates back to the
# original image's pixel grid.
scale_height = raw_height / height
scale_width = raw_width / width

# Batched variant (batch size > 1), kept as a usage note. Here "height" and
# "width" are indexed per sample:
# inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
# height, width = inputs.pop("height"), inputs.pop("width")
# raw_width, raw_height = image.size
# scale_height = raw_height / height[0]
# scale_width = raw_width / width[0]

# Move every present tensor to the target device; the image patches are
# additionally cast to the dtype the model was loaded with.
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
40
def post_process(y, scale_height, scale_width):
    """Convert raw generated text into comma-separated quadrilateral records.

    The module-level ``prompt`` is stripped from ``y``. When that prompt
    contains ``"<md>"`` the stripped text is returned unchanged. Otherwise
    every ``<bbox>...</bbox>`` tag is paired with the text that follows it,
    and each pair becomes ``x0,y0,x1,y0,x1,y1,x0,y1,text`` with the
    coordinates multiplied by the scale factors and truncated to ints.

    Args:
        y: Decoded model output.
        scale_height: Multiplier applied to the y coordinates.
        scale_width: Multiplier applied to the x coordinates.

    Returns:
        The concatenated records. No separator is inserted between them, so
        any line breaks come from the text segments themselves.
    """
    # NOTE(review): block nesting was reconstructed from a flattened listing;
    # this assumes a record is emitted only for non-degenerate boxes -- confirm.
    text = y.replace(prompt, "")
    if "<md>" in prompt:
        return text

    bbox_tag = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    tags = re.findall(bbox_tag, text)
    # Whatever precedes the first tag is dropped, leaving one segment per tag.
    segments = re.split(bbox_tag, text)[1:]

    records = []
    for tag, segment in zip(tags, segments):
        x0, y0, x1, y1 = (int(number) for number in re.findall(r"\d+", tag))
        # Boxes with no positive width or height are skipped entirely.
        if x0 >= x1 or y0 >= y1:
            continue
        x0 = int(x0 * scale_width)
        y0 = int(y0 * scale_height)
        x1 = int(x1 * scale_width)
        y1 = int(y1 * scale_height)
        # Four corners in the order top-left, top-right, bottom-right, bottom-left.
        records.append(f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{segment}")
    return "".join(records)
60
+
61
# OCR demo, part 2: convert the first decoded sequence into box records,
# print them, and outline each box on the input image.
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)

draw = ImageDraw.Draw(image)
for record in output_text.split("\n"):
    fields = record.split(",")
    # Rows with fewer than eight fields (such as an empty string) cannot
    # describe a full quadrilateral and are skipped.
    if len(fields) < 8:
        continue
    # The first eight fields are the four corner points; anything after
    # them is the recognized text, which may itself contain commas.
    corners = [int(value) for value in fields[:8]]
    draw.polygon(corners, outline="red")
image.save("output.png")
tokenizer.json CHANGED
@@ -91,7 +91,7 @@
91
  "lstrip": true,
92
  "rstrip": false,
93
  "normalized": false,
94
- "special": false
95
  },
96
  {
97
  "id": 100283,
@@ -145,7 +145,7 @@
145
  "lstrip": true,
146
  "rstrip": false,
147
  "normalized": false,
148
- "special": false
149
  },
150
  {
151
  "id": 100289,
 
91
  "lstrip": true,
92
  "rstrip": false,
93
  "normalized": false,
94
+ "special": true
95
  },
96
  {
97
  "id": 100283,
 
145
  "lstrip": true,
146
  "rstrip": false,
147
  "normalized": false,
148
+ "special": true
149
  },
150
  {
151
  "id": 100289,