kirp@umich.edu committed on
Commit
629edc1
1 Parent(s): ff64714
Files changed (4) hide show
  1. README.md +1 -6
  2. md.py +0 -1
  3. ocr.py +8 -9
  4. output.png +2 -2
README.md CHANGED
@@ -15,12 +15,7 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
15
  ## NOTE
16
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
17
 
18
- ## How to Use?
19
- This repo will be soon merged to official Transformers.
20
- ```bash
21
- pip install git+https://github.com/tic-top/transformers.git
22
- ```
23
-
24
  ### Markdown Task
25
  Run with [md.py](md.py).
26
  ```text
 
15
  ## NOTE
16
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
17
 
18
+ ## Usage
 
 
 
 
 
19
  ### Markdown Task
20
  Run with [md.py](md.py).
21
  ```text
md.py CHANGED
@@ -12,7 +12,6 @@ processor = AutoProcessor.from_pretrained(repo)
12
 
13
  # sample image
14
  url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
15
- url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
16
  image = Image.open(requests.get(url, stream=True).raw)
17
 
18
  prompt = "<md>"
 
12
 
13
  # sample image
14
  url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
 
15
  image = Image.open(requests.get(url, stream=True).raw)
16
 
17
  prompt = "<md>"
ocr.py CHANGED
@@ -5,17 +5,16 @@ from PIL import Image, ImageDraw
5
  from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
 
7
  repo = "microsoft/kosmos-2.5"
8
- device = "cuda:0"
9
  dtype = torch.bfloat16
10
  model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
11
  processor = AutoProcessor.from_pretrained(repo)
12
 
13
  # sample image
14
  url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
15
- url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
16
  image = Image.open(requests.get(url, stream=True).raw)
17
 
18
- # singe image
19
  prompt = "<ocr>"
20
  inputs = processor(text=prompt, images=image, return_tensors="pt")
21
  height, width = inputs.pop("height"), inputs.pop("width")
@@ -23,12 +22,12 @@ raw_width, raw_height = image.size
23
  scale_height = raw_height / height
24
  scale_width = raw_width / width
25
 
26
- # batch generate
27
- # inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
28
- # height, width = inputs.pop("height"), inputs.pop("width")
29
- # raw_width, raw_height = image.size
30
- # scale_height = raw_height / height[0]
31
- # scale_width = raw_width / width[0]
32
 
33
  inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
34
  inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
 
5
  from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
 
7
  repo = "microsoft/kosmos-2.5"
8
+ device = "cuda:1"
9
  dtype = torch.bfloat16
10
  model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
11
  processor = AutoProcessor.from_pretrained(repo)
12
 
13
  # sample image
14
  url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
 
15
  image = Image.open(requests.get(url, stream=True).raw)
16
 
17
+ # bs = 1
18
  prompt = "<ocr>"
19
  inputs = processor(text=prompt, images=image, return_tensors="pt")
20
  height, width = inputs.pop("height"), inputs.pop("width")
 
22
  scale_height = raw_height / height
23
  scale_width = raw_width / width
24
 
25
+ # bs > 1, batch decoding sample
26
+ inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
27
+ height, width = inputs.pop("height"), inputs.pop("width")
28
+ raw_width, raw_height = image.size
29
+ scale_height = raw_height / height[0]
30
+ scale_width = raw_width / width[0]
31
 
32
  inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
33
  inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
output.png CHANGED

Git LFS Details

  • SHA256: 1cfde9c32f97383e601620280b60787bae1472010e8b5fe4d92a445fde0df6cd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.66 MB

Git LFS Details

  • SHA256: 410b17e2b48d588c7bd9317e924e69841c0b9670848fe0efa217389d74882d32
  • Pointer size: 132 Bytes
  • Size of remote file: 1.66 MB