Spaces:
Runtime error
Runtime error
upgrade to GPU; move to taskflow
Browse files- app.py +57 -31
- requirements.txt +5 -4
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
#-*- coding: UTF-8 -*-
|
2 |
-
# Copyright 2022
|
3 |
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
4 |
#
|
5 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -14,21 +14,17 @@
|
|
14 |
# See the License for the specific language governing permissions and
|
15 |
# limitations under the License.
|
16 |
|
17 |
-
import os
|
18 |
-
import json
|
19 |
import base64
|
20 |
-
from io import BytesIO
|
21 |
-
from PIL import Image
|
22 |
import traceback
|
23 |
|
24 |
-
import requests
|
25 |
-
import numpy as np
|
26 |
import gradio as gr
|
27 |
import cv2
|
28 |
|
|
|
29 |
from paddlenlp.utils.doc_parser import DocParser
|
30 |
|
31 |
doc_parser = DocParser()
|
|
|
32 |
|
33 |
examples = [
|
34 |
[
|
@@ -38,7 +34,7 @@ examples = [
|
|
38 |
[
|
39 |
"license.jpeg",
|
40 |
"Name;DOB;ISS;EXP",
|
41 |
-
|
42 |
[
|
43 |
"invoice.jpeg",
|
44 |
"名称;纳税人识别号;开票日期",
|
@@ -62,11 +58,11 @@ example_files = {
|
|
62 |
}
|
63 |
|
64 |
lang_map = {
|
65 |
-
"resume.png": "ch
|
66 |
-
"custom.jpeg": "ch
|
67 |
-
"business_card.png": "en
|
68 |
-
"invoice.jpeg": "ch
|
69 |
-
"license.jpeg": "en
|
70 |
}
|
71 |
|
72 |
def dbc2sbc(s):
|
@@ -139,43 +135,73 @@ def np2base64(image_np):
|
|
139 |
return base64_str
|
140 |
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
def process_doc(document, schema, ocr_lang, layout_analysis):
|
143 |
if not schema:
|
144 |
schema = '时间;组织机构;人物'
|
145 |
if document is None:
|
146 |
return None, None
|
147 |
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
base64_str = np2base64(doc_parser.read_image(document))
|
156 |
-
|
157 |
-
r = requests.post(url, json={"doc": base64_str, "schema": schema, "option": option})
|
158 |
-
response = r.json()
|
159 |
-
print(response)
|
160 |
-
predictions = response['result']
|
161 |
|
162 |
img_show = doc_parser.write_image_with_results(
|
163 |
-
|
164 |
-
result=
|
165 |
-
max_size=2000,
|
166 |
return_image=True)
|
167 |
img_list = [img_show]
|
168 |
|
169 |
return (
|
170 |
gr.update(visible=True, value=img_list),
|
171 |
-
gr.update(visible=True, value=
|
172 |
)
|
173 |
|
174 |
|
175 |
def load_example_document(img, schema, ocr_lang, layout_analysis):
|
176 |
if img is not None:
|
177 |
document = example_files[schema]
|
178 |
-
|
|
|
|
|
179 |
preview, answer = process_doc(document, schema, ocr_lang, layout_analysis)
|
180 |
return document, schema, preview, gr.update(visible=True), answer
|
181 |
else:
|
|
|
1 |
#-*- coding: UTF-8 -*-
|
2 |
+
# Copyright 2022 the HuggingFace Team.
|
3 |
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
4 |
#
|
5 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
14 |
# See the License for the specific language governing permissions and
|
15 |
# limitations under the License.
|
16 |
|
|
|
|
|
17 |
import base64
|
|
|
|
|
18 |
import traceback
|
19 |
|
|
|
|
|
20 |
import gradio as gr
|
21 |
import cv2
|
22 |
|
23 |
+
from paddlenlp import Taskflow
|
24 |
from paddlenlp.utils.doc_parser import DocParser
|
25 |
|
26 |
doc_parser = DocParser()
|
27 |
+
task_instance = Taskflow("information_extraction", model="uie-x-base", schema="")
|
28 |
|
29 |
examples = [
|
30 |
[
|
|
|
34 |
[
|
35 |
"license.jpeg",
|
36 |
"Name;DOB;ISS;EXP",
|
37 |
+
],
|
38 |
[
|
39 |
"invoice.jpeg",
|
40 |
"名称;纳税人识别号;开票日期",
|
|
|
58 |
}
|
59 |
|
60 |
# OCR options for each bundled example image, keyed by file name.
# load_example_document splits the value on "-": the first part is used as
# the OCR language, and the presence of any further part turns layout
# analysis on.
lang_map = {
    "resume.png": "ch",
    "custom.jpeg": "ch",
    "business_card.png": "en",
    "invoice.jpeg": "ch",
    "license.jpeg": "en",
}
|
67 |
|
68 |
def dbc2sbc(s):
|
|
|
135 |
return base64_str
|
136 |
|
137 |
|
138 |
+
def get_schema(schema_str):
    """Parse a ``;``-separated schema string into a schema list and language tag.

    Each ``;``-separated item is either a plain name or
    ``subject|relation[|relation...]``. Plain names are appended to the
    result as strings. Items with relations become ``{subject: [relations]}``
    dicts; when the same subject appears again, its relations are appended to
    the dict already in the list instead of creating a second dict.

    Surrounding whitespace is stripped from every name. Empty items (for
    example from a trailing ``;``), empty relations, and items whose subject
    is empty are dropped so that no blank prompt ends up in the schema.

    Args:
        schema_str: Raw schema text, e.g. ``"Name;DOB"`` or ``"a|b;a|c"``.

    Returns:
        A ``(schema_list, schema_lang)`` tuple. ``schema_lang`` is ``"ch"``
        when ``schema_str`` contains at least one character in the range
        U+4E00..U+9FFF, otherwise ``"en"``.
    """
    has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in schema_str)
    schema_lang = "ch" if has_cjk else "en"

    schema_list = []
    for item in schema_str.split(";"):
        fields = [part.strip() for part in item.split("|")]
        subject = fields[0]
        relations = [rel for rel in fields[1:] if rel]

        if not subject:
            # Blank item (e.g. "a;;b" or a trailing ";"): nothing to extract.
            continue
        if not relations:
            schema_list.append(subject)
            continue

        # Merge into an existing dict for this subject if there is one.
        for entry in schema_list:
            if isinstance(entry, dict) and subject in entry:
                entry[subject].extend(relations)
                break
        else:
            schema_list.append({subject: relations})

    return schema_list, schema_lang
|
165 |
+
|
166 |
+
|
167 |
+
def run_taskflow(document, schema, argument):
    """Apply ``schema`` to the shared Taskflow instance and run it on ``document``.

    Args:
        document: Value passed to the task under the ``'doc'`` key.
        schema: Schema handed to ``task_instance.set_schema``.
        argument: Currently unused; the ``set_argument`` call is disabled.

    Returns:
        Whatever ``task_instance`` returns for the single-document request.
    """
    task_instance.set_schema(schema)
    # NOTE: extra options are not forwarded yet.
    # task_instance.set_argument(argument)
    request = {'doc': document}
    return task_instance(request)
|
171 |
+
|
172 |
+
|
173 |
def process_doc(document, schema, ocr_lang, layout_analysis):
    """Run extraction on ``document`` and build the two Gradio output updates.

    Args:
        document: Document to process; ``None`` short-circuits the call.
        schema: ``;``-separated schema text. A falsy value falls back to the
            built-in default schema.
        ocr_lang: Stored under ``"ocr_lang"`` in the argument dict.
        layout_analysis: Stored under ``"layout_analysis"`` in the argument dict.

    Returns:
        ``(None, None)`` when ``document`` is ``None``. Otherwise a pair of
        ``gr.update`` objects: the first carries a one-element list with the
        annotated image, the second carries the first prediction returned by
        ``run_taskflow``.
    """
    if document is None:
        return None, None

    schema_text = schema or '时间;组织机构;人物'
    # dbc2sbc (defined elsewhere in this file) normalises the text before parsing.
    parsed_schema, schema_lang = get_schema(dbc2sbc(schema_text))

    argument = dict(
        ocr_lang=ocr_lang,
        schema_lang=schema_lang,
        layout_analysis=layout_analysis,
    )
    predictions = run_taskflow(document, parsed_schema, argument)
    prediction = predictions[0]

    annotated = doc_parser.write_image_with_results(
        document, result=prediction, return_image=True)

    preview_update = gr.update(visible=True, value=[annotated])
    answer_update = gr.update(visible=True, value=prediction)
    return (preview_update, answer_update)
|
197 |
|
198 |
|
199 |
def load_example_document(img, schema, ocr_lang, layout_analysis):
|
200 |
if img is not None:
|
201 |
document = example_files[schema]
|
202 |
+
choice = lang_map[document].split("-")
|
203 |
+
ocr_lang = choice[0]
|
204 |
+
layout_analysis = False if len(choice) == 1 else True
|
205 |
preview, answer = process_doc(document, schema, ocr_lang, layout_analysis)
|
206 |
return document, schema, preview, gr.update(visible=True), answer
|
207 |
else:
|
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
-
numpy
|
2 |
opencv-python
|
3 |
-
|
4 |
-
|
5 |
-
|
|
|
|
1 |
+
numpy==1.21.6
|
2 |
opencv-python
|
3 |
+
# Install PaddleNLP from git for now; switch to the released package once v2.4.6 is out.
|
4 |
+
PaddleNLP
|
5 |
+
paddleocr
|
6 |
+
paddlepaddle-gpu
|