File size: 1,480 Bytes
e5bfa19 47d96e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
---
language:
- en
library_name: transformers
tags:
- donut
- donut-python
---
### Installation
```bash
pip install torch
pip install transformers==4.11.3
pip install opencv-python==4.6.0.66
pip install donut-python
```
### Usage
```python
import sys
import os
import pandas as pd
import numpy as np
import shutil
from tqdm import tqdm
import re
from donut import DonutModel
import torch
from PIL import Image
# Path/identifier of the checkpoint handed to DonutModel.from_pretrained.
en_model_path = "question_generator_by_en_on_pic"
# Prompt template; "{user_input}" is replaced with the caller's text per request.
task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"

en_pretrained_model = DonutModel.from_pretrained(en_model_path)
if torch.cuda.is_available():
    # Only when a CUDA device is present: switch to half precision and
    # move the model onto the GPU. On CPU the model is left unchanged.
    en_pretrained_model.half()
    device = torch.device("cuda")
    en_pretrained_model.to(device)
# Inference only: put the model in evaluation mode regardless of device.
en_pretrained_model.eval()
print("have load !")
def demo_process_vqa(input_img, question):
    """Generate a question about ``input_img`` for a given answer text.

    The loaded checkpoint is used as a question generator, so the DocVQA
    slots are used in reverse: the text passed as ``question`` is inserted
    into the prompt's ``<s_question>`` slot, and the generated question is
    read back from the prediction's ``"answer"`` field. The returned dict
    swaps the two back into their natural roles.

    Args:
        input_img: Image passed straight to ``en_pretrained_model.inference``.
            NOTE(review): presumably must be a ``PIL.Image``; convert NumPy
            arrays with ``Image.fromarray`` first -- confirm against donut.
        question: Text substituted for ``{user_input}`` in ``task_prompt``
            (for this checkpoint, the answer to generate a question for).

    Returns:
        dict: ``{"question": <prediction "answer">, "answer": <prediction
        "question">}`` built from the first prediction.

    Raises:
        KeyError: If the first prediction lacks a ``"question"`` or
            ``"answer"`` field.
    """
    user_prompt = task_prompt.replace("{user_input}", question)
    output = en_pretrained_model.inference(input_img, prompt=user_prompt)["predictions"][0]
    # Intentional swap: see the docstring for why the fields are reversed.
    return {
        "question": output["answer"],
        "answer": output["question"],
    }
img_path = "en_img.png"
# demo_process_vqa takes exactly (image, text); pass the answer text whose
# question should be generated.
demo_process_vqa(Image.open(img_path), "605-7227")
'''
{
"question": "What is the Phone #?",
"answer": "605-7227"
}
'''
```
### Sample Image
<img src="https://raw.githubusercontent.com/svjack/docvqa-gen/main/imgs/en_img.png" width = "500px" height = "500px"/> |