jase64 committed on
Commit aa0784d • 1 Parent(s): 82452ac

Create app.py

Files changed (1):
  1. app.py +374 -0
app.py ADDED
@@ -0,0 +1,374 @@
#!python
# Engineering PDF tag extractor
# by Serge Jaumain / SPIE Oil & Gas Services
#
# 31/05/2023

# importing required modules
import re
import pandas as pd
import fitz
import streamlit as st
from io import BytesIO
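
# Dependency note (a sketch based on the imports and the Excel export below,
# not an official requirements list): this app needs pymupdf (imported as
# fitz), pandas, streamlit and xlsxwriter.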


def find_pattern(text, include, exclude, remove):
    """Find pattern <include> in <text> but exclude <exclude>. Finally it removes <remove> strings from the result.

    Args:
        text (string): text to be scanned
        include (string): REGEX expression to extract patterns from text
        exclude (string): REGEX expression to exclude patterns from the search in text
        remove (string): REGEX expression removed from the text before the search

    Returns:
        list: patterns found after filtering
    """
    if remove is None:
        remove = ''
    if include is None:
        include = ''
    find = re.findall(include, re.sub(remove, '', text))
    if not exclude:
        filtered = find
    else:
        # keep only the hits that do not also match the exclude pattern
        filtered = [el for el in find if re.findall(exclude, el) == []]
    clean = filtered
    # if remove != []:
    #     for txt in remove:
    #         clean = [el.replace(txt, '') for el in clean]
    return clean
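
# A minimal usage sketch of how include/exclude interact (made-up strings,
# not taken from a real PDF):
#   find_pattern("100-PT-001 / 100-XV-002", r"\d{3}-[A-Z]{2}-\d{3}", r"XV", "")
#   -> ['100-PT-001']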

def get_from_text(doc, include, exclude, remove):
    """Retrieves the text of the PDF with all layers switched on

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains the regex string of patterns to remove before the search

    Returns:
        list: raw list of tags found
    """
    # switch on all layers
    doc_layers = doc.layer_ui_configs()
    for doc_layer in doc_layers:
        doc.set_layer_ui_config(doc_layer['number'], action=0)

    # join all pages with a separator so a pattern cannot span two pages
    text = '|'.join([page.get_text() for page in doc])

    return find_pattern(text, include, exclude, remove)

def get_from_toc(doc, include, exclude, remove):
    """Retrieves the table of contents (TOC) from the PDF

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains the regex string of patterns to remove before the search

    Returns:
        list: raw list of tags found
    """
    # get_toc() returns [level, title, page] entries; join the titles into
    # one string so find_pattern can scan them
    text = '|'.join([item[1] for item in doc.get_toc()])

    return find_pattern(text, include, exclude, remove)

def get_bookmark(doc, bm_text, include, exclude, remove):
    """Retrieves the bookmarks from the PDF

    Args:
        doc (fitz document): actual pdf document to extract
        bm_text (string): string used to select the bookmarks to search (not case sensitive)
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains the regex string of patterns to remove before the search

    Returns:
        list: list like [tag1, tag2, ...]
    """
    items = doc.get_toc()
    tags = []
    flag = False
    for item in items:
        if bm_text == '$':
            # "$" means: scan every bookmark title
            clean = find_pattern(item[1], include, exclude, remove)
            tags.extend(clean)
        else:
            if item[0] == 1:
                # level-1 bookmark: remember whether its title matches bm_text
                flag = bm_text.upper() in item[1].upper()
            else:
                if flag:
                    # deeper bookmark inside a matching section
                    clean = find_pattern(item[1], include, exclude, remove)
                    tags.extend(clean)
    return tags
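
# For reference, doc.get_toc() yields entries shaped like [level, title, page],
# e.g. in a made-up document:
#   [1, 'Instrument list', 4]
#   [2, 'FT-1001 Flow transmitter', 5]
# so a matching level-1 title opens a section whose deeper entries are scanned.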

def get_layer(doc, layer2search, include, exclude, remove):
    """Retrieves the text of the selected PDF layers

    Args:
        doc (fitz document): actual pdf document to extract
        layer2search (string): comma-separated names of the layers to be extracted ("$" for all)
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains the regex string of patterns to remove before the search

    Returns:
        list: raw list of tags found
    """
    doc_layers = doc.layer_ui_configs()
    # switch on all layers if "$" is found somewhere,
    # else switch off all layers not wanted
    layer_names = [name.strip() for name in layer2search.split(',')]
    if any(name.startswith('$') for name in layer_names):
        for layer in doc_layers:
            doc.set_layer_ui_config(layer['number'], action=0)
    else:
        for layer in doc_layers:
            if layer['text'] in layer_names:
                doc.set_layer_ui_config(layer['number'], action=0)
            else:
                doc.set_layer_ui_config(layer['number'], action=2)

    # get all pages
    text = '|'.join([page.get_text() for page in doc])

    return find_pattern(text, include, exclude, remove)
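
# Note (based only on how these dicts are used above): doc.layer_ui_configs()
# lists the optional-content switches of the PDF, each as a dict carrying at
# least 'number' (passed to set_layer_ui_config) and 'text' (the layer name
# shown in the viewer UI).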

def extract_tag(file, patterns):
    """Extracts pattern list <patterns> from <file>

    Args:
        file (file object): PDF file object to be extracted
        patterns (list): list of pattern rows [Name, Where, Labels, Include, Exclude, Remove]

    Returns:
        tuple: (list of [pattern name, tag, filename] rows, error text string)
    """
    # creating a pdf reader object
    doc = fitz.open(stream=file.read(), filetype='pdf')
    # go through all patterns to be detected
    tag_list = []
    for pattern in patterns:
        pname = pattern[0].strip()
        where = pattern[1].strip().upper()
        label = pattern[2].strip()
        include = pattern[3]
        exclude = pattern[4]
        remove = pattern[5]
        error_txt = ''
        if where == "TEXT":
            tags = get_from_text(doc, include, exclude, remove)
        elif where == "TOC":
            tags = get_from_toc(doc, include, exclude, remove)
        elif where == "BOOKMARK":
            tags = get_bookmark(doc, label, include, exclude, remove)
        elif where == "LAYER":
            tags = get_layer(doc, label, include, exclude, remove)
            # if len(label) == 1:
            #     tags = get_layer(doc, [], include, exclude, remove)
            # else:
            #     tags = []
            #     for layer in label:
            #         tags.append(get_layer(doc, layer, include, exclude, remove))
        elif where == "PATH":
            tags = find_pattern(file.name, include, exclude, remove)
        else:
            tags = []
            error_txt = where + ' does not exist'

        for tag in tags:
            tag_list.append([pname, tag, file.name])

    return tag_list, error_txt

def file_info(file_list):
    """Builds a small summary (page count and usable 'Where' keywords) per uploaded PDF"""
    res = {"File": [], "Pages": [], "Wheres": []}
    for file in file_list:
        doc = fitz.open(stream=file.read(), filetype='pdf')
        file.seek(0)  # rewind the upload buffer so it can be read again later
        res['File'].append(file.name)
        res['Pages'].append(doc.page_count)
        where_file = []
        if len(doc.layer_ui_configs()) > 0:
            where_file.append('LAYER')
        if ''.join([page.get_text() for page in doc]) != '':
            where_file.append('TEXT')
        if len(doc.get_toc()) > 0:
            where_file.append('BOOKMARK')
        res['Wheres'].append(where_file)
        doc.close()

    return pd.DataFrame(res)
205
+
206
+ ##################################### Define Streamlit interface ########################################
207
+ st.set_page_config(layout="wide")
208
+ st.markdown('## **PDF tag Extractor**')
209
+ st.markdown('**v2.40** (June 2023 / S. Jaumain)')
210
+ #st.markdown('###### by S. Jaumain')
211
+
212
+
213
+ tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result'])
214
+
215
+ ##################################### TAB 1 ########################################
216
+ with tab1:
217
+ st.subheader('Choose your PDF file(s):')
218
+ placeholder = st.empty()
219
+ #placeholder2 = st.empty()
220
+ st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True)
221
+ # check existence of PDF files
222
+ if st.session_state.pdf_files:
223
+ placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='βœ…')
224
+ #with placeholder2.expander(':information_source: FILE INFO'):
225
+ # st.dataframe(file_info(st.session_state.pdf_files), use_container_width=True, hide_index=True)
226
+
227
+ else:
228
+ placeholder.warning('No file selected yet.', icon='πŸ“’')
229
+
##################################### TAB 2 ########################################
patterns = [["Tags Instrument",
             "BOOKMARK",
             "instrument",
             "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}",
             "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)",
             "",
             ]
            ]
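# A loose reading of the default row above (illustrative, not exhaustive):
# scan bookmarks whose title contains "instrument" for tags shaped like
# AAAAA-BB..-999999 (e.g. the made-up tag UNITX-PT-000123) and drop hits
# containing controller/valve/alarm codes such as PIC, PSV, TAH or PALL.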

st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name', 'Where', 'Labels', 'Include', 'Exclude', 'Remove'])
st.session_state.df_pattern.index.name = "Pattern #"
st.session_state.flag = False

help_lines = """
:blue[Name] gives a string with the name/type to be displayed in the output list

:blue[Where] gives a string with one of the following options:

- "TEXT" = search in the plain PDF text

- "BOOKMARK" = search in bookmarks whose name contains the :blue[Labels] string. If :blue[Labels]="$" then all bookmarks are searched.

- "LAYER" = search in the layers named in :blue[Labels]

- "PATH" = search the pattern in the file name.

- "TOC" = search the pattern in the table of contents.

:blue[Include] gives a regex string for the patterns to include

:blue[Exclude] gives a regex string for the patterns to exclude. :red[BEWARE:] exclude has priority 2

:blue[Remove] gives a regex string removed from the text before the search. :red[BEWARE:] remove has priority 1
"""
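# Example of an extra pattern row matching the columns above (hypothetical values):
#   ["Line numbers", "TEXT", "", r'\d{2}"-P-\d{4}', r"TEST", ""]
# i.e. scan the plain PDF text for line numbers such as 12"-P-3456 and drop
# any hit containing "TEST".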

warn_flag = True
where_keywords = ['TEXT', 'PATH', 'BOOKMARK', 'LAYER', 'TOC']
df_config = {
    'Name': st.column_config.TextColumn('Name',
                                        required=True
                                        ),
    'Where': st.column_config.TextColumn('Where',
                                         help='Indicate where to search. Can be ' + ', '.join(where_keywords) + '.',
                                         default='TEXT',
                                         required=True,
                                         validate='|'.join(where_keywords)
                                         ),
    'Labels': st.column_config.TextColumn('Labels',
                                          help='Indicate the label of the Bookmark or Layer to search in. For all use "$".',
                                          ),
    'Include': st.column_config.TextColumn('Include',
                                           help='For examples of REGEXs please refer to https://regex101.com/',
                                           required=True,
                                           validate=r'\S'
                                           ),
    'Exclude': st.column_config.TextColumn('Exclude',
                                           help='For examples of REGEXs please refer to https://regex101.com/',
                                           required=False,
                                           default='',
                                           #validate=r'\S'
                                           )
}

with tab2:
    if 'df_error' not in st.session_state:
        st.session_state.df_error = False
    st.header('REGEX dictionary')
    with st.expander(':question: HELP'):
        st.markdown(help_lines)

    tab2_placehld = st.empty()

    st.session_state.df_pattern = st.data_editor(st.session_state.df_pattern,
                                                 column_config=df_config,
                                                 use_container_width=True,
                                                 num_rows='dynamic',
                                                 #disabled=['Check'],
                                                 key='TT')

    if st.session_state.TT['edited_rows'] != {} or st.session_state.TT['added_rows'] != []:
        st.session_state.df_error = False
        for i, row in st.session_state.df_pattern.iterrows():

            if row['Where'] in ['BOOKMARK', 'LAYER'] and row['Labels'] == '':
                st.session_state.df_error = True
                tab2_placehld.warning('"' + row['Name'] + '" row: missing <LABEL> error. Required with Bookmarks or Layers.', icon='📢')

            try:
                re.compile(row['Include'])
            except re.error:
                tab2_placehld.warning('"' + row['Name'] + '" row: Include REGEX pattern not valid. Refer to HELP', icon='📢')
                st.session_state.df_error = True

            if row['Exclude'] is None:
                st.session_state.df_pattern.loc[i, 'Exclude'] = ''
            else:
                try:
                    re.compile(row['Exclude'])
                except re.error:
                    tab2_placehld.warning('"' + row['Name'] + '" row: Exclude REGEX pattern not valid. Refer to HELP', icon='📢')
                    st.session_state.df_error = True


##################################### TAB 3 ########################################
patterns = st.session_state.df_pattern.values.tolist()
st.session_state.df = pd.DataFrame({})
with tab3:
    col1, col2 = st.columns(2)
    tab3_placehld = col1.empty()
    filename = col1.text_input('XLSX output file name for extracted tags:', value='tags.xlsx')
    filename = filename.split('.')[0]
    rm_duplicates = col1.checkbox('remove duplicates?', value=True)
    one_sheet = col1.checkbox('All extraction categories on one single sheet?', value=True)
    btn = col1.button('Extract tags')
    tab3_placehld2 = col1.empty()
    if btn and len(st.session_state.pdf_files) == 0:
        tab3_placehld.warning('No files selected!', icon='⛔')
    if btn and len(st.session_state.pdf_files) > 0:
        tag_list = []
        error_list = []
        progress_text = 'Extraction on-going'
        progress_bar = col2.progress(0, text=progress_text)
        for i, file in enumerate(st.session_state.pdf_files):
            tag_ls, err_txt = extract_tag(file, patterns)
            tag_list.extend(tag_ls)
            error_list.append(err_txt)
            progress_bar.progress((i+1)/len(st.session_state.pdf_files), text=progress_text)
        progress_bar.progress((i+1)/len(st.session_state.pdf_files), text="Completed")
        st.session_state.df = pd.DataFrame(tag_list, columns=['Tag type', 'Tag', 'Origin file'])
        if rm_duplicates:
            st.session_state.df = st.session_state.df.drop_duplicates(subset=['Tag', 'Origin file'])
        col2.success(f'Tag(s) found: {st.session_state.df.shape[0]}')
        col2.dataframe(st.session_state.df, use_container_width=True, hide_index=True)
        if st.session_state.df.shape[0] > 0:
            buffer = BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as excel:
                if one_sheet:
                    st.session_state.df.to_excel(excel, sheet_name='tags', index=False)
                else:
                    for category in pd.unique(st.session_state.df['Tag type']):
                        st.session_state.df[st.session_state.df['Tag type'] == category].to_excel(excel, sheet_name=category, index=False)
                #excel.close()
            col2.download_button('📥 Download as XLSX', data=buffer, file_name=filename + '.xlsx', mime='application/vnd.ms-excel')
        else:
            col1.warning('File empty! Not written.', icon='❌')