Sage committed
Commit 18626e5 · 1 Parent(s): cfb190d
error handling, UI changes, and Logs
- ai_functions.py +12 -5
- app.py +72 -25
- assets/logo.png +0 -0
- css.py +11 -0
- helpers.py +43 -6
- ocr_functions.py +8 -1
- RPFAAP1.json → templates/RPFAAP1.json +1 -1
- RPFAAP2.json → templates/RPFAAP2.json +0 -0
- TDRP.json → templates/TDRP.json +0 -0
ai_functions.py
CHANGED
@@ -1,10 +1,13 @@
-from settings import gpt_model, RPFAAP2, RPFAAP1, TDRP, TDRP_COORDS
 import openai
 import json
 import logging
-
+import sys
 import os
+from settings import gpt_model, RPFAAP2, RPFAAP1, TDRP, TDRP_COORDS
+from helpers import remove_na, filter_tables, merge_strings, Logger
+
 logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+sys.stdout = Logger("output.log")
 
 def chat_gpt_image(content, context):
     openai.api_key = os.environ['GPT_API_KEY']
@@ -20,6 +23,7 @@ def chat_gpt_image(content, context):
 
     final_content = (" ".join(sequence))
     logging.info(final_content)
+    print(final_content)
     completion = openai.ChatCompletion.create(
         model=gpt_model,
         user="1",
@@ -29,6 +33,7 @@ def chat_gpt_image(content, context):
         ]
     )
     logging.info(completion.choices[0].message.content)
+    print(completion.choices[0].message.content)
     return(completion.choices[0].message.content)
 
 def chat_gpt_document(content, document_type, context):
@@ -43,17 +48,17 @@ def chat_gpt_document(content, document_type, context):
     content_name = content[3]
 
     if document_type == "RPFAA Building P1":
-        document = "RPFAAP1.json"
+        document = "./templates/RPFAAP1.json"
         desired_format = RPFAAP1
         tables = [3]
         input_coords = TDRP_COORDS
     elif document_type == "RPFAA Building P2":
-        document = "RPFAAP2.json"
+        document = "./templates/RPFAAP2.json"
         desired_format = RPFAAP2
         tables = []
         input_coords = TDRP_COORDS
     elif document_type == "TDRP":
-        document = "TDRP.json"
+        document = "./templates/TDRP.json"
        desired_format = TDRP
         tables = [0]
         input_coords = TDRP_COORDS
@@ -68,6 +73,7 @@ def chat_gpt_document(content, document_type, context):
 
     content_1 = (" ".join(sequence_1))
     logging.info(content_1)
+    print(content_1)
 
     completion_1 = openai.ChatCompletion.create(
         model=gpt_model,
@@ -78,6 +84,7 @@ def chat_gpt_document(content, document_type, context):
         ]
     )
     logging.info(completion_1.choices[0].message.content)
+    print(completion_1.choices[0].message.content)
     input_string = remove_na(completion_1.choices[0].message.content)
     input_string = merge_strings(input_string,input_coords,document_content)
 
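Worth noting: `sys.stdout = Logger("output.log")` swaps this module's stdout for the tee-style `Logger` class the commit adds to helpers.py, so the new `print(...)` calls land both on the console and in output.log, which the new Logs accordion in app.py tails. A minimal sketch of the effect, using only what the commit itself sets up:

import sys
from helpers import Logger  # tee: writes to the real stdout AND to output.log

sys.stdout = Logger("output.log")
print("final prompt...")  # visible on the console and in the Logs tab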
app.py
CHANGED
@@ -2,16 +2,18 @@ import openai
 import gradio as gr
 import json
 import time
-from tqdm import tqdm
-from azure.core.exceptions import HttpResponseError
 import logging
 import requests
-import
+import sys
+from tqdm import tqdm
+from azure.core.exceptions import HttpResponseError
 from ocr_functions import detect_document, detect_image
 from ai_functions import chat_gpt_document, chat_gpt_image
-from helpers import save_json
+from helpers import save_json, read_logs, clear_logs, Logger
+from css import css
 
 logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+sys.stdout = Logger("output.log")
 
 def retry_unprocessed_documents():
     global global_document_type
@@ -19,12 +21,15 @@ def retry_unprocessed_documents():
     global unprocessed_documents
     if unprocessed_documents:
         output = batch_document(unprocessed_documents, global_document_type, global_context, "None")
-        unprocessed_documents = []
         return output
     else:
-
-        return save_json("No Unprocessed Documents", "No Unprocessed Documents")
+        return save_json("No Unprocessed Documents", "No Unprocessed Documents"), "All Documents Processed"
 
+def clear_unprocessed_documents():
+    global unprocessed_documents
+    unprocessed_documents = []
+    return "All Documents Processed"
+
 def combine_json_files(json_files, progress=gr.Progress()):
     combined_data = []
     progress(0, desc="Starting")
@@ -33,19 +38,21 @@ def combine_json_files(json_files, progress=gr.Progress()):
         data = json.load(json_file)
         combined_data.extend(data)
     logging.info("Combined JSON File: ", combined_data)
+    print("Combined JSON File: ", combined_data)
     return save_json(combined_data, "Combined Json")
 
 unprocessed_documents = []
 global_document_type = None
 global_context = None
 def batch_document(content, document_type, context, progress = gr.Progress()):
-    logging.info(content)
     combined_data = []
     global global_document_type
     global global_context
     global_document_type = document_type
     global_context = context
 
+    unprocessed_docs_temp = []
+
     if progress == "None":
         for x in content:
             retries = 3
@@ -55,17 +62,23 @@ def batch_document(content, document_type, context, progress = gr.Progress()):
                 try:
                     data = json.loads(chat_gpt_document(detect_document(x),document_type,context))
                     combined_data.append(data)
-                    i = 0
                     break
                 except (openai.error.APIConnectionError, openai.error.AuthenticationError, openai.error.RateLimitError, HttpResponseError, requests.exceptions.RequestException) as e:
                     logging.error(f'Retry {i+1} failed: {e}')
+                    print(f'Retry {i+1} failed: {e}')
                     if i < retries - 1:
                         logging.error(f'Retrying in {timeout} seconds...')
+                        print(f'Retrying in {timeout} seconds...')
                         time.sleep(timeout)
                         i += 1
                     else:
-
+                        unprocessed_docs_temp.append(x)
                         break
+                except Exception as e: # catch any other exceptions
+                    logging.error(f'Unexpected error {e}')
+                    print(f'Unexpected error {e}')
+                    unprocessed_docs_temp.append(x)
+                    break
 
     else:
         progress(0, desc="Starting")
@@ -77,23 +90,42 @@ def batch_document(content, document_type, context, progress = gr.Progress()):
                 try:
                     data = json.loads(chat_gpt_document(detect_document(x),document_type,context))
                     combined_data.append(data)
-                    i = 0
                     break
                 except (openai.error.APIConnectionError, openai.error.AuthenticationError, openai.error.RateLimitError, HttpResponseError, requests.exceptions.RequestException) as e:
                     logging.error(f'Retry {i+1} failed: {e}')
+                    print(f'Retry {i+1} failed: {e}')
                     if i < retries - 1:
                         logging.error(f'Retrying in {timeout} seconds...')
+                        print(f'Retrying in {timeout} seconds...')
                         time.sleep(timeout)
                         i += 1
                     else:
                         unprocessed_documents.append(x)
                         break
+                except Exception as e: # catch any other exceptions
+                    logging.error(f'Unexpected error {e}')
+                    print(f'Unexpected error {e}')
+                    unprocessed_documents.append(x)
+                    break
 
     logging.info(combined_data)
-
+    print(combined_data)
+
     if document_type == "":
         document_type = "error"
-
+
+    if unprocessed_documents:
+        unprocessed = "\n".join([doc.name.split('\\')[-1].split('/')[-1].split('.')[0] for doc in unprocessed_documents])
+        logging.info(unprocessed)
+        print(unprocessed)
+    elif unprocessed_docs_temp:
+        unprocessed_documents.extend(unprocessed_docs_temp)
+        unprocessed = "\n".join([doc.name.split('\\')[-1].split('/')[-1].split('.')[0] for doc in unprocessed_documents])
+        logging.info(unprocessed)
+        print(unprocessed)
+    else:
+        unprocessed = "All Documents Processed"
+    return save_json(combined_data, document_type), unprocessed
 
 def image(content, context):
     retries = 3
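Both branches of `batch_document` above share one retry pattern: up to `retries` attempts per file, a fixed `timeout` sleep between attempts, known API/HTTP errors retried, and any file that still fails (or hits an unexpected exception) collected for the new Retry/Clear buttons. A standalone sketch of that pattern; `process` stands in for the `chat_gpt_document(detect_document(x), ...)` call, and the `retries`/`timeout` defaults here are illustrative, not the app's:

import time
import logging

def process_with_retries(x, process, unprocessed, retries=3, timeout=30):
    for i in range(retries):
        try:
            return process(x)            # success: stop retrying
        except Exception as e:           # the app first narrows this to API/HTTP errors
            logging.error(f'Retry {i+1} failed: {e}')
            if i < retries - 1:
                logging.error(f'Retrying in {timeout} seconds...')
                time.sleep(timeout)
            else:
                unprocessed.append(x)    # surfaced in the Unprocessed Documents box
    return None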
@@ -105,8 +137,10 @@ def image(content, context):
             break
         except (openai.error.APIConnectionError, openai.error.AuthenticationError, openai.error.RateLimitError, HttpResponseError, requests.exceptions.RequestException) as e:
             logging.error(f'Retry {i+1} failed: {e}')
+            print(f'Retry {i+1} failed: {e}')
             if i < retries - 1:
                 logging.error(f'Retrying in {timeout} seconds...')
+                print(f'Retrying in {timeout} seconds...')
                 time.sleep(timeout)
                 i += 1
             else:
@@ -129,9 +163,13 @@ def document(content, document_type, context):
                 i += 1
             else:
                 break
+        except Exception as e: # catch any other exceptions
+            logging.error(f'Unexpected error {e}')
+            print(f'Unexpected error {e}')
+            break
     return data
 
-with gr.Blocks(title="Axon OCR", css=".markdown {text-align: center;}") as app:
+with gr.Blocks(title="Axon OCR", css=css) as app:
     gr.Markdown("""# Axon OCR
     Attach Images or Files below and convert them to Text.""", elem_classes="markdown")
     with gr.Tab("Scan Image"):
@@ -140,7 +178,7 @@ with gr.Blocks(title="Axon OCR", css=".markdown {text-align: center;}") as app:
         image_input = [gr.Image(type="pil"),
                        gr.Textbox(label="What kind of Image is this? (Optional)", placeholder="This is an image of an Official Reciept")]
         image_output = gr.Textbox(label="Result")
-        image_button = gr.Button("Scan")
+        image_button = gr.Button("Scan", variant="primary")
     with gr.Tab("Scan Document"):
         with gr.Row():
             with gr.Column():
@@ -148,33 +186,42 @@ with gr.Blocks(title="Axon OCR", css=".markdown {text-align: center;}") as app:
                 gr.Dropdown(["RPFAA Building P1", "RPFAA Building P2", "TDRP"], label="File Type", info="What type of document is this?"),
                 gr.Textbox(label="Any additional information? (Optional)", placeholder="This is document is an Official Reciept")]
         document_output = gr.Textbox(label="Result")
-        document_button = gr.Button("Scan")
+        document_button = gr.Button("Scan", variant="primary")
     with gr.Tab("Batch Scan"):
         with gr.Row():
             with gr.Column():
                 batch_document_input = [gr.File(file_types=["pdf","tiff","image","text"], file_count="multiple"),
                     gr.Dropdown(["RPFAA Building P1", "RPFAA Building P2", "TDRP"], label="File Type", info="What type of document is this?"),
                     gr.Textbox(label="Any additional information? (Optional)", placeholder="This is document is an Official Reciept")]
-
-
+            with gr.Column():
+                batch_document_output = gr.File(label="Result")
+                with gr.Accordion("Unprocessed Documents", open=False):
+                    batch_unprocessed = gr.Textbox(info="Download the file before retrying Unprocessed Documents and clear unprocessed documents after every scan to avoid overlaps", show_label=False, elem_classes="unprocessed_textbox")
+                    clear_unprocessed_button = gr.Button("Clear Unprocessed Documents")
+        batch_document_button = gr.Button("Scan", variant="primary")
         with gr.Row():
             with gr.Column():
-                retry_button = gr.Button("Retry Unprocessed Documents"
+                retry_button = gr.Button("Retry Unprocessed Documents")
             with gr.Column():
-                stop_button = gr.Button("Stop Processing Document",
+                stop_button = gr.Button("Stop Processing Document", variant="stop")
     with gr.Tab("Combine JSON"):
         with gr.Row():
             with gr.Column():
                 json_files_input = gr.File(file_types=[".json"], file_count="multiple", label='Upload JSON files')
                 combined_json_output = gr.File(label="Result")
-        combine_button = gr.Button('Combine JSON files')
-
+        combine_button = gr.Button('Combine JSON files', variant="primary")
+    with gr.Accordion("Logs", open=False):
+        logs = gr.Textbox(max_lines=10, show_label=False, elem_classes="log_textbox")
+        app.load(read_logs, None, logs, every=1)
+        clear_button = gr.Button("Clear Logs")
+    clear_button.click(clear_logs)
+    clear_unprocessed_button.click(clear_unprocessed_documents, outputs=batch_unprocessed)
     image_button.click(image, inputs=image_input, outputs=image_output)
     document_button.click(document, inputs=document_input, outputs=document_output)
-    batch_document_event = batch_document_button.click(batch_document, inputs=batch_document_input, outputs=batch_document_output)
-    retry_button.click(retry_unprocessed_documents, outputs=batch_document_output)
+    batch_document_event = batch_document_button.click(batch_document, inputs=batch_document_input, outputs=[batch_document_output,batch_unprocessed])
+    retry_button.click(retry_unprocessed_documents, outputs=[batch_document_output,batch_unprocessed])
     stop_button.click(fn=None, inputs=None, outputs=None, cancels=[batch_document_event])
     combine_button.click(combine_json_files, inputs=json_files_input, outputs=combined_json_output)
 
 app.queue()
-app.launch(auth=("username", "password"))
+app.launch(auth=("username", "password"), favicon_path="assets/logo.png")
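The Logs accordion is a polling loop: `app.load(read_logs, None, logs, every=1)` re-runs `read_logs` once per second and pushes the tail of output.log into the textbox, which works because every module tees its stdout into that file. A self-contained sketch of the same mechanism (component names here are illustrative):

import gradio as gr

def read_logs():
    # tail the tee'd stdout file; the real helper also flushes sys.stdout first
    # and assumes output.log exists (the Logger creates it at import time)
    with open("output.log", "r", encoding="utf-8") as f:
        return "".join(f.readlines()[-100:])

with gr.Blocks() as demo:
    logs = gr.Textbox(max_lines=10, show_label=False)
    demo.load(read_logs, None, logs, every=1)  # poll once per second

demo.queue()   # `every=` requires the queue to be enabled
demo.launch()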
assets/logo.png
ADDED
css.py
ADDED
@@ -0,0 +1,11 @@
+css = """
+.log_textbox textarea {
+    height: 300px !important;
+}
+.markdown {
+    text-align: center;
+}
+.unprocessed_textbox textarea {
+    height: 100px !important;
+}
+"""
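These rules take effect because app.py passes the string to `gr.Blocks(css=css)` and tags the matching components with `elem_classes`; a minimal sketch of the wiring:

import gradio as gr
from css import css

with gr.Blocks(css=css) as app:
    # elem_classes="log_textbox" attaches this textbox to the .log_textbox rule
    logs = gr.Textbox(max_lines=10, elem_classes="log_textbox")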
helpers.py
CHANGED
@@ -1,7 +1,26 @@
 from settings import char_remove
 import re
 import json
+import sys
 import logging
+
+class Logger:
+    def __init__(self, filename):
+        self.terminal = sys.stdout
+        self.log = open(filename, "w")
+
+    def write(self, message):
+        self.terminal.write(message)
+        self.log.write(message)
+
+    def flush(self):
+        self.terminal.flush()
+        self.log.flush()
+
+    def isatty(self):
+        return False
+
+sys.stdout = Logger("output.log")
 logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 def remove_na(string):
@@ -29,8 +48,12 @@ def filter_tables(input_string, table_numbers):
 
     for table_number in table_numbers:
         # Picking the specific table
-
-
+        try:
+            table_str = tables[table_number]
+        except (IndexError, UnboundLocalError) as e:
+            logging.error(f"Error: {e}, Please check document configuration or document type")
+            print(f"Error: {e}, Please check document configuration or document type")
+            raise e
         # Extracting cell coordinates and contents
         cells = re.findall(r"Cell\[(\d+)\]\[(\d+)\] has content '(.*?)'", table_str)
 
@@ -78,10 +101,14 @@ def merge_strings(input_string, input_coords, extract_coords):
     # Filter out empty lines and strip leading/trailing whitespaces
     lines2 = [line.strip() for line in lines2 if line.strip()]
 
-    logging.info(lines2)
     # Creating dictionaries to store the key-value pairs
-
-
+    try:
+        dict1 = {line.split(": ")[0]: line.split(": ")[1] for line in lines1}
+        dict2 = {line.split(": ")[0]: line.split(": ")[1] for line in lines2}
+    except (IndexError, UnboundLocalError) as e:
+        logging.error(f"Error: {e}, Please check document configuration or document type")
+        print(f"Error: {e}, Please check document configuration or document type")
+        raise e
 
     # Updating the values in dict1 with the ones from dict2 if they share the same key
     for key in dict1.keys():
@@ -103,4 +130,14 @@ def merge_strings(input_string, input_coords, extract_coords):
     # Constructing the updated string1
     input_string = '\n'.join([f"{key}: {value}" for key, value in dict1.items()])
 
-    return input_string
+    return input_string
+
+def read_logs():
+    sys.stdout.flush()
+    with open("output.log","r",encoding="utf-8") as f:
+        lines = f.readlines()
+        return ''.join(lines[-100:])
+
+def clear_logs():
+    with open("output.log","w",encoding="utf-8") as f:
+        f.write("")
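The two new try/except guards turn the most common crashes (a table index that does not exist for the chosen document type, and an OCR line without a `": "` separator) into logged errors with a hint before re-raising. For example, a malformed line trips the dict-building guard like this:

line = "Owner"        # OCR produced a key with no ": " separator
line.split(": ")[1]   # raises IndexError: list index out of range,
                      # which is now logged with "Please check document
                      # configuration or document type" and re-raised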
ocr_functions.py
CHANGED
@@ -1,11 +1,14 @@
 from azure.core.credentials import AzureKeyCredential
 from azure.ai.formrecognizer import DocumentAnalysisClient
 from io import BytesIO
-from helpers import format_polygon
+from helpers import format_polygon, Logger
 import logging
 import os
+import sys
 
 logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+sys.stdout = Logger("output.log")
+
 endpoint = os.environ['AZURE_API_ENDPOINT']
 key = os.environ['AZURE_API_KEY']
 
@@ -28,6 +31,7 @@ def detect_document(content):
                 kv_pair.value.content
             )
     logging.info(pair_content)
+    print(pair_content)
     document_content = "----Lines found in document----\n"
     for page in result.pages:
         for line_idx, line in enumerate(page.lines):
@@ -37,6 +41,7 @@ def detect_document(content):
             format_polygon(line.polygon),
         )
     logging.info(document_content)
+    print(document_content)
     table_content = "----Tables found in document----\n"
     for table_idx, table in enumerate(result.tables):
         table_content += "Table # {} has {} rows and {} columns\n".format(
@@ -49,6 +54,7 @@ def detect_document(content):
             cell.content,
         )
     logging.info(table_content)
+    print(table_content)
     name = content.name.split('\\')[-1]
     name = name.split("/")[-1]
     name = name.split('.')[0]
@@ -67,4 +73,5 @@ def detect_image(content):
 
     result = poller.result()
     logging.info(result.content)
+    print(result.content)
     return(result.content)
RPFAAP1.json → templates/RPFAAP1.json
RENAMED
@@ -1,7 +1,7 @@
 {"File Name": "%s",
 "General Information": {
 "ARP No.": "",
-"
+"OWNER": "",
 "Address": "",
 "Tel No.": "",
 "Administrator/Beneficial User": "",
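The `"%s"` in `"File Name"` is a printf-style placeholder. The code that fills it is outside this diff, so as a hedged sketch only, the template is presumably loaded and completed with %-formatting using the `content_name` extracted in `chat_gpt_document`:

# hypothetical illustration; the actual fill happens elsewhere in the app
with open("./templates/RPFAAP1.json") as f:
    template = f.read()
filled = template % content_name   # -> '{"File Name": "scan_001", ...'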
RPFAAP2.json → templates/RPFAAP2.json
RENAMED
File without changes
TDRP.json → templates/TDRP.json
RENAMED
File without changes