Spaces:
Configuration error
Configuration error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!python
|
2 |
+
# # Engineering PDF tag extractor
|
3 |
+
# by Serge Jaumain / SPIE Oil & Gas Services
|
4 |
+
#
|
5 |
+
# 31/05/2023
|
6 |
+
|
7 |
+
# importing required modules
|
8 |
+
import re
|
9 |
+
import pandas as pd
|
10 |
+
import fitz
|
11 |
+
import streamlit as st
|
12 |
+
from io import BytesIO
|
13 |
+
|
14 |
+
|
15 |
+
def find_pattern(text, include, exclude, remove):
    """Find pattern <include> in <text>, excluding matches of <exclude>.

    The <remove> pattern is stripped from the text before searching.

    Args:
        text (str): text to be scanned.
        include (str): regex used to extract patterns from the text.
        exclude (str): regex; any match containing it is dropped.
        remove (str): regex removed from the text before matching.

    Returns:
        list: matching strings, filtered.
    """
    # Normalise None inputs so the regex calls below never fail
    # (identity check: PEP 8 mandates `is None`, not `== None`).
    if remove is None:
        remove = ''
    if include is None:
        include = ''
    found = re.findall(include, re.sub(remove, '', text))
    if not exclude:
        return found
    # Keep only matches that do not themselves contain the exclude pattern.
    return [el for el in found if not re.findall(exclude, el)]
|
41 |
+
|
42 |
+
def get_from_text(doc, include, exclude, remove):
    """Extract tags from the visible text layer of a PDF.

    Args:
        doc (fitz document): PDF document to scan.
        include (str): regex of tags to include.
        exclude (str): regex of tags to exclude.
        remove (str): regex of substrings removed before matching.

    Returns:
        list: raw list of tags found.
    """
    # Switch every optional-content layer on before reading the text
    # (action=0 turns a layer on).
    for layer in doc.layer_ui_configs():
        doc.set_layer_ui_config(layer['number'], action=0)

    # Concatenate all pages; '|' keeps page boundaries from fusing tags.
    full_text = '|'.join(page.get_text() for page in doc)

    return find_pattern(full_text, include, exclude, remove)
|
62 |
+
|
63 |
+
def get_from_toc(doc, include, exclude, remove):
    """Extract tags from the table of contents of a PDF.

    Args:
        doc (fitz document): PDF document to scan.
        include (str): regex of tags to include.
        exclude (str): regex of tags to exclude.
        remove (str): regex of substrings removed before matching.

    Returns:
        list: raw list of tags found.
    """
    # doc.get_toc() returns [level, title, page] entries, not a string;
    # passing the raw list into find_pattern() made re.sub() raise a
    # TypeError. Join the titles so the search receives plain text.
    text = '|'.join(item[1] for item in doc.get_toc())

    return find_pattern(text, include, exclude, remove)
|
78 |
+
|
79 |
+
def get_bookmark(doc, bm_text, include, exclude, remove):
    """Extract tags from PDF bookmarks selected by *bm_text*.

    Args:
        doc (fitz document): PDF document to scan.
        bm_text (str): substring selecting the top-level bookmarks to
            search (not case sensitive); '$' means search every bookmark.
        include (str): regex of tags to include.
        exclude (str): regex of tags to exclude.
        remove (str): regex of substrings removed before matching.

    Returns:
        list: list like [tag1, tag2, ...]
    """
    tags = []
    in_selected_branch = False
    for entry in doc.get_toc():
        level, title = entry[0], entry[1]
        if bm_text == '$':
            # Wildcard: scan every bookmark title.
            tags.extend(find_pattern(title, include, exclude, remove))
        elif level == 1:
            # A top-level entry decides whether its children are scanned.
            in_selected_branch = bm_text.upper() in title.upper()
        elif in_selected_branch:
            tags.extend(find_pattern(title, include, exclude, remove))
    return tags
|
107 |
+
|
108 |
+
def get_layer(doc, layer2search, include, exclude, remove):
    """Extract tags from selected optional-content layers of a PDF.

    Args:
        doc (fitz document): PDF document to scan.
        layer2search (iterable of str): names of the layers to extract;
            an entry starting with "$" switches ALL layers on.
        include (str): regex of tags to include.
        exclude (str): regex of tags to exclude.
        remove (str): regex of substrings removed before matching.

    Returns:
        list: raw list of tags found.
    """
    doc_layers = doc.layer_ui_configs()
    # Switch on all layers if "$" is requested, else switch on only the
    # wanted layers (action=0 -> layer on, action=2 -> layer off).
    for layersearched in layer2search:
        # startswith() instead of [0]: avoids IndexError on empty entries.
        if layersearched.strip().startswith("$"):
            for layer in doc_layers:
                doc.set_layer_ui_config(layer['number'], action=0)
            break
        else:
            for layer in doc_layers:
                if layer['text'] in layersearched.strip():
                    doc.set_layer_ui_config(layer['number'], action=0)
                else:
                    doc.set_layer_ui_config(layer['number'], action=2)

    # Get the text of all pages with the chosen layer visibility applied.
    text = '|'.join(page.get_text() for page in doc)

    return find_pattern(text, include, exclude, remove)
|
140 |
+
|
141 |
+
def extract_tag(file, patterns):
    """Extract every pattern of <patterns> from PDF <file>.

    Args:
        file (file object): PDF file object to be extracted.
        patterns (list): pattern rows, each
            [name, where, labels, include, exclude, remove].

    Returns:
        tuple: (tag_list, error_txt) where tag_list is
            [[pattern name1, tag1, filename1], [pattern name2, tag2, ...]]
            and error_txt is '' or a message naming an unknown 'where'
            keyword.
    """
    # creating a pdf reader object
    doc = fitz.open(stream=file.read(), filetype='pdf')
    tag_list = []
    # error_txt is set once, outside the loop, so an error reported for an
    # early pattern is no longer wiped out by later, valid patterns.
    error_txt = ''
    # go through all patterns to be detected
    for pattern in patterns:
        pname = pattern[0].strip()
        where = pattern[1].strip().upper()
        label = pattern[2].strip()
        include = pattern[3]
        exclude = pattern[4]
        remove = pattern[5]
        if where == "TEXT":
            tags = get_from_text(doc, include, exclude, remove)
        elif where == "TOC":
            tags = get_from_toc(doc, include, exclude, remove)
        elif where == "BOOKMARK":
            tags = get_bookmark(doc, label, include, exclude, remove)
        elif where == "LAYER":
            tags = get_layer(doc, label, include, exclude, remove)
        elif where == "PATH":
            tags = find_pattern(file.name, include, exclude, remove)
        else:
            # Unknown keyword: record the error and skip this pattern.
            # tags must be reset here, otherwise it would be unbound on
            # the first iteration (NameError) or stale on later ones.
            tags = []
            error_txt = where + ' does not exist'

        for tag in tags:
            tag_list.append([pname, tag, file.name])

    return tag_list, error_txt
|
186 |
+
|
187 |
+
def file_info(file_list):
    """Summarise uploaded PDFs: name, page count and searchable areas.

    Args:
        file_list (list): uploaded PDF file objects.

    Returns:
        pandas.DataFrame: columns 'File', 'Pages' and 'Wheres' (the
            'Where' keywords that make sense for each file).
    """
    res = {"File": [], "Pages": [], "Wheres": []}
    # Iterate the uploads directly: the previous `os.dup(file_list)` call
    # was invalid (`os` was never imported, and os.dup duplicates an OS
    # file descriptor, not a Python list).
    for file in file_list:
        doc = fitz.open(stream=file.read(), filetype='pdf')
        file.seek(0)  # rewind so the stream can be read again later
        res['File'].append(file.name)
        res['Pages'].append(doc.page_count)
        where_file = []
        if len(doc.layer_ui_configs()) > 0:
            where_file.append('LAYER')
        if ''.join(page.get_text() for page in doc) != '':
            where_file.append('TEXT')
        if len(doc.get_toc()) > 0:
            where_file.append('BOOKMARK')
        res['Wheres'].append(where_file)
        doc.close()

    return pd.DataFrame(res)
|
205 |
+
|
206 |
+
##################################### Define Streamlit interface ########################################
st.set_page_config(layout="wide")
st.markdown('## **PDF tag Extractor**')
st.markdown('**v2.40** (June 2023 / S. Jaumain)')


tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result'])

##################################### TAB 1 ########################################
with tab1:
    st.subheader('Choose your PDF file(s):')
    # Placeholder so the status message can be swapped in place.
    placeholder = st.empty()
    st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True)
    # check existence of PDF files
    if st.session_state.pdf_files:
        # NOTE(review): the original icon characters were mojibake in the
        # scraped source; plausible emoji restored — confirm against the app.
        placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='✅')
    else:
        placeholder.warning('No file selected yet.', icon='🟢')
|
229 |
+
|
230 |
+
##################################### TAB 2 ########################################
# Default pattern row shown in the editable table:
# [Name, Where, Labels, Include, Exclude, Remove]
patterns = [["Tags Instrument",
             "BOOKMARK",
             "instrument",
             "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}",
             "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)",
             "",
             ]
            ]

st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name', 'Where', 'Labels', 'Include', 'Exclude', 'Remove'])
st.session_state.df_pattern.index.name = "Pattern #"
st.session_state.flag = False

help_lines = """
:blue[Name] give a string with the name/type to be displayed in the output list

:blue[Where] give a list [...] of strings with following options:

- ["TEXT"] = search in plain PDF text

- ["BOOKMARK",<label>] = search in bookmarks with name containing <label>. if <name>="$" then all.

- ["LAYER", <list>] = search in layers named in <list> as a list of strings

- ["PATH"] = search pattern in path name.

- ["TOC"] = search pattern in table of content.

:blue[Include] give a regex string for the patterns to include

:blue[Exclude] give a regex string for the patterns to exclude. :red[BEWARE:] exclude has priority 2

:blue[Remove] a list of strings to be removed from found patterns :red[BEWARE:] remove has priority 1
"""
warn_flag = True
where_keywords = ['TEXT', 'PATH', 'BOOKMARK', 'LAYER', 'TOC']
# Column configuration for the st.data_editor pattern table.
df_config = {
    'Name': st.column_config.TextColumn('Name',
                                        required=True
                                        ),
    'Where': st.column_config.TextColumn('Where',
                                         help='Indicate where to search. Can be ' + ', '.join(where_keywords) + '.',
                                         default='TEXT',
                                         required=True,
                                         validate='|'.join(where_keywords)
                                         ),
    'Labels': st.column_config.TextColumn('Labels',
                                          help='Indicate the label of Bookmark or Layer to search in. For all use "$".',
                                          ),
    'Include': st.column_config.TextColumn('Include',
                                           help='For examples of REGEXs please refer to https://regex101.com/',
                                           required=True,
                                           validate=r'\S'  # raw string: \S is a regex class, not an escape
                                           ),
    'Exclude': st.column_config.TextColumn('Exclude',
                                           help='For examples of REGEXs please refer to https://regex101.com/',
                                           required=False,
                                           default='',
                                           )
}

with tab2:
    if 'df_error' not in st.session_state:
        st.session_state.df_error = False
    st.header('REGEX dictionary')
    with st.expander(':question: HELP'):
        st.markdown(help_lines)

    tab2_placehld = st.empty()

    st.session_state.df_pattern = st.data_editor(st.session_state.df_pattern,
                                                 column_config=df_config,
                                                 use_container_width=True,
                                                 num_rows='dynamic',
                                                 key='TT')

    # Re-validate only when the user actually edited or added rows.
    # Truthiness works for both containers; the original compared
    # added_rows (a list) against {} which is always unequal.
    if st.session_state.TT['edited_rows'] or st.session_state.TT['added_rows']:
        st.session_state.df_error = False
        for i, row in st.session_state.df_pattern.iterrows():

            # BOOKMARK and LAYER searches need a label to select from.
            if row['Where'] in ['BOOKMARK', 'LAYER'] and row['Labels'] == '':
                st.session_state.df_error = True
                tab2_placehld.warning('"' + row['Name'] + '" row: missing <LABEL> error. Required with Bookmarks or Layers.', icon='🟢')

            # Both regex columns must compile before extraction is allowed.
            try:
                re.compile(row['Include'])
            except re.error:
                tab2_placehld.warning('"' + row['Name'] + '" row: Include REGEX pattern not valid. Refer to HELP', icon='🟢')
                st.session_state.df_error = True

            if row['Exclude'] is None:
                st.session_state.df_pattern.loc[i, 'Exclude'] = ''
            else:
                try:
                    re.compile(row['Exclude'])
                except re.error:
                    tab2_placehld.warning('"' + row['Name'] + '" row: Exclude REGEX pattern not valid. Refer to HELP', icon='🟢')
                    st.session_state.df_error = True
332 |
+
|
333 |
+
##################################### TAB 3 ########################################
patterns = st.session_state.df_pattern.values.tolist()
st.session_state.df = pd.DataFrame({})
with tab3:
    col1, col2 = st.columns(2)
    tab3_placehld = col1.empty()
    filename = col1.text_input('XLSX output file name for extracted tags:', value='tags.xlsx')
    filename = filename.split('.')[0]  # keep the stem; '.xlsx' is re-added at download
    rm_duplicates = col1.checkbox('remove duplicates?', value=True)
    one_sheet = col1.checkbox('All extraction categories on one single sheet?', value=True)
    btn = col1.button('Extract tags')
    tab3_placehld2 = col1.empty()
    if btn and len(st.session_state.pdf_files) == 0:
        tab3_placehld.warning('No files selected!', icon='❗')
    if btn and len(st.session_state.pdf_files) > 0:
        tag_list = []
        error_list = []
        progress_text = 'Extraction on-going'
        progress_bar = col2.progress(0, text=progress_text)
        for i, file in enumerate(st.session_state.pdf_files):
            tag_ls, err_txt = extract_tag(file, patterns)
            tag_list.extend(tag_ls)
            # append, not extend: err_txt is a single string and extend()
            # would add it character by character; skip empty messages.
            if err_txt:
                error_list.append(err_txt)
            progress_bar.progress((i + 1) / len(st.session_state.pdf_files), text=progress_text)
        progress_bar.progress(1.0, text="Completed")
        st.session_state.df = pd.DataFrame(tag_list, columns=['Tag type', 'Tag', 'Origin file'])
        if rm_duplicates:
            st.session_state.df = st.session_state.df.drop_duplicates(subset=['Tag', 'Origin file'])
        col2.success(f'Tag(s) found: {st.session_state.df.shape[0]}')
        col2.dataframe(st.session_state.df, use_container_width=True, hide_index=True)
        if st.session_state.df.shape[0] > 0:
            buffer = BytesIO()
            # The context manager closes/finalises the workbook, so no
            # explicit excel.close() is needed.
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as excel:
                if one_sheet:
                    st.session_state.df.to_excel(excel, sheet_name='tags', index=False)
                else:
                    # One worksheet per extraction category.
                    for category in pd.unique(st.session_state.df['Tag type']):
                        st.session_state.df[st.session_state.df['Tag type'] == category].to_excel(excel, sheet_name=category, index=False)
            col2.download_button('📥 Download as XLSX', data=buffer, file_name=filename + '.xlsx', mime='application/vnd.ms-excel')
        else:
            col1.warning('File empty! Not written.', icon='❗')
|