gaunernst committed
Commit aa7e4bb · 1 Parent(s): 92f36b4

change pipeline to manual model

Files changed (1):
  1. app.py +35 -6
app.py CHANGED
@@ -1,12 +1,14 @@
 import cv2
 import gradio as gr
 import numpy as np
+import torch
 from paddleocr import PaddleOCR
 from PIL import Image
-from transformers import pipeline
+from transformers import AutoTokenizer, LayoutLMForQuestionAnswering
 from transformers.pipelines.document_question_answering import apply_tesseract
 
-PIPE = pipeline("document-question-answering", "impira/layoutlm-document-qa")
+MODEL = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa").eval()
+TOKENIZER = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa")
 OCR = PaddleOCR(
     use_angle_cls=True,
     lang="en",
@@ -52,9 +54,34 @@ def predict(image: Image.Image, question: str, ocr_engine: str):
     else:
         raise ValueError(f"Unsupported ocr_engine={ocr_engine}")
 
-    word_boxes = list(zip(words, boxes))
-    result = PIPE(image, question, word_boxes)[0]
-    return result["answer"], result["score"], image_np
+    token_ids = TOKENIZER(question)["input_ids"]
+    token_boxes = [[0] * 4] * (len(token_ids) - 1) + [[1000] * 4]
+
+    token_ids.append(TOKENIZER.sep_token_id)
+    token_boxes.append([1000] * 4)
+
+    for word, box in zip(words, boxes):
+        new_ids = TOKENIZER(word, add_special_tokens=False)["input_ids"]
+        token_ids.extend(new_ids)
+        token_boxes.extend([box] * len(new_ids))
+
+    token_ids.append(TOKENIZER.sep_token_id)
+    token_boxes.append([1000] * 4)
+
+    with torch.inference_mode():
+        outputs = MODEL(
+            input_ids=torch.tensor(token_ids).unsqueeze(0),
+            bbox=torch.tensor(token_boxes).unsqueeze(0),
+        )
+
+    start_scores = outputs.start_logits.squeeze(0).softmax(-1)
+    end_scores = outputs.end_logits.squeeze(0).softmax(-1)
+
+    start_score, start_idx = start_scores.max(-1)
+    end_score, end_idx = end_scores.max(-1)
+    answer = TOKENIZER.decode(token_ids[start_idx : end_idx + 1])
+
+    return answer, start_score, end_score, image_np
 
 
 gr.Interface(
@@ -66,7 +93,9 @@ gr.Interface(
     ],
     outputs=[
         gr.Textbox(label="Answer"),
-        gr.Number(label="Score"),
+        gr.Number(label="Start score"),
+        gr.Number(label="End score"),
         gr.Image(label="OCR results"),
     ],
+    examples=[["example_01.jpg", "When did the sample take place?", PADDLE_OCR_LABEL]],
 ).launch(server_name="0.0.0.0", server_port=7860)
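
For reference, the fiddliest part of the new code is the input layout: the <s> token and the question tokens get a dummy [0, 0, 0, 0] box, each separator gets [1000, 1000, 1000, 1000], and every subword token of an OCR word repeats that word's box (coordinates normalized to 0-1000). A standalone sketch that prints the assembled sequence, using the same checkpoint as the commit; the words and boxes below are made up for illustration:

# Sketch: print the token/box sequence that predict() now builds, outside Gradio.
# Same layout as the committed code; the toy words/boxes below are invented.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa")

question = "When did the sample take place?"
words = ["March", "2021"]                             # pretend OCR output
boxes = [[100, 100, 180, 130], [190, 100, 260, 130]]  # 0-1000 normalized

token_ids = tok(question)["input_ids"]                # <s> + question + </s>
token_boxes = [[0] * 4] * (len(token_ids) - 1) + [[1000] * 4]

token_ids.append(tok.sep_token_id)                    # second separator before the words
token_boxes.append([1000] * 4)

for word, box in zip(words, boxes):
    ids = tok(word, add_special_tokens=False)["input_ids"]
    token_ids.extend(ids)
    token_boxes.extend([box] * len(ids))              # all subwords share the word box

token_ids.append(tok.sep_token_id)                    # closing separator
token_boxes.append([1000] * 4)

for token, box in zip(tok.convert_ids_to_tokens(token_ids), token_boxes):
    print(f"{token:>12}  {box}")

This is roughly what the removed document-question-answering pipeline assembled internally before calling the model.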
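
One caveat in the new decoding step: start_idx and end_idx come from two independent argmaxes, so nothing stops end_idx from landing before start_idx, in which case the decoded answer is an empty string. A possible guard, not part of this commit, is to score (start, end) pairs jointly and keep the best valid one; a minimal sketch, assuming the start_scores/end_scores tensors computed above:

# Optional hardening sketch (not in this commit): choose the best span with
# start <= end instead of two independent argmaxes.
import torch

def best_span(start_scores: torch.Tensor, end_scores: torch.Tensor, max_len: int = 30):
    # scores[i, j] = P(start = i) * P(end = j); keep only i <= j < i + max_len
    scores = start_scores.unsqueeze(1) * end_scores.unsqueeze(0)
    scores = torch.triu(scores) - torch.triu(scores, diagonal=max_len)
    flat_idx = int(scores.argmax())
    start_idx, end_idx = divmod(flat_idx, scores.size(1))
    return start_idx, end_idx, scores[start_idx, end_idx].item()

The returned probability product is comparable to the single score the old pipeline reported; note the sketch still does not mask out the question tokens themselves.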