Spaces:
Configuration error
Configuration error
File size: 15,364 Bytes
aa0784d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 |
#!python
# # Engineering PDF tag extractor
# by Serge Jaumain / SPIE Oil & Gas Services
#
# 31/05/2023
# importing required modules
import re
import pandas as pd
import fitz
import streamlit as st
from io import BytesIO
def find_pattern(text, include, exclude, remove):
    """Find all <include> regex matches in <text>, after deleting <remove>
    matches, and drop any hit that also matches <exclude>.

    Args:
        text (str): text to be scanned.
        include (str): regex of the patterns to extract (None is treated as '').
        exclude (str): regex of patterns to reject; falsy means "reject nothing".
        remove (str): regex of substrings stripped from <text> before the
            search (None is treated as '').

    Returns:
        list: matched strings, in order of appearance.
    """
    # normalize optional arguments (fix: use identity test, not '== None')
    if remove is None:
        remove = ''
    if include is None:
        include = ''
    # strip the unwanted substrings first, then collect every candidate
    found = re.findall(include, re.sub(remove, '', text))
    if not exclude:
        return found
    # keep only candidates that contain no <exclude> match
    # (re.search replaces the original 're.findall(...) == []' double work)
    return [el for el in found if not re.search(exclude, el)]
def get_from_text(doc, include, exclude, remove):
    """Extract tags from the plain text of every page of the PDF.

    Args:
        doc (fitz document): open PDF document to extract from
        include (string): regex string of tags to include
        exclude (string): regex string of tags to exclude
        remove (string): string patterns to remove from the result

    Returns:
        list: raw list of tags found
    """
    # make every optional-content layer visible before reading the pages
    for layer in doc.layer_ui_configs():
        doc.set_layer_ui_config(layer['number'], action=0)
    # concatenate all page texts; '|' keeps page boundaries from merging tags
    pages = [page.get_text() for page in doc]
    return find_pattern('|'.join(pages), include, exclude, remove)
def get_from_toc(doc, include, exclude, remove):
    """Retrieves tags from the PDF table of contents.

    Bug fix: doc.get_toc() returns a list of [level, title, page] entries,
    not a string; passing the raw list to find_pattern() made re.sub()
    raise TypeError. The entry titles are now joined into one string.

    Args:
        doc (fitz document): open PDF document to extract from
        include (string): regex string of tags to include
        exclude (string): regex string of tags to exclude
        remove (string): string patterns to remove from the result

    Returns:
        list: raw list of tags found
    """
    # keep only the entry titles; '|' separates entries so tags cannot merge
    text = '|'.join(item[1] for item in doc.get_toc())
    return find_pattern(text, include, exclude, remove)
def get_bookmark(doc, bm_text, include, exclude, remove):
    """Retrieves tags from the PDF bookmarks.

    Args:
        doc (fitz document): open PDF document to extract from
        bm_text (string): selects the top-level bookmarks whose children are
            searched (not case sensitive); '$' means scan every bookmark title
        include (string): regex string of tags to include
        exclude (string): regex string of tags to exclude
        remove (string): string patterns to remove from the result

    Returns:
        list: list like [tag1, tag2, ...]
    """
    tags = []
    selected = False
    for level, title, *_ in doc.get_toc():
        if bm_text == '$':
            # wildcard: every bookmark title is scanned
            tags.extend(find_pattern(title, include, exclude, remove))
            continue
        if level == 1:
            # a top-level entry (de)activates collection for its children
            selected = bm_text.upper() in title.upper()
        elif selected:
            tags.extend(find_pattern(title, include, exclude, remove))
    return tags
def get_layer(doc, layer2search, include, exclude, remove):
    """Extract tags from selected optional-content layers of the PDF.

    Args:
        doc (fitz document): open PDF document to extract from
        layer2search (iterable of string): names of the layers to scan; an
            entry starting with '$' switches every layer on instead
        include (string): regex string of tags to include
        exclude (string): regex string of tags to exclude
        remove (string): string patterns to remove from the result

    Returns:
        list: raw list of tags found
    """
    available = doc.layer_ui_configs()
    for wanted in layer2search:
        wanted = wanted.strip()
        if wanted[0] == '$':
            # '$' wildcard: make every layer visible and stop filtering
            for layer in available:
                doc.set_layer_ui_config(layer['number'], action=0)
            break
        # otherwise show only layers whose name occurs in the wanted entry
        # NOTE(review): containment test is layer name *in* the requested
        # string, not the other way round — confirm this is intentional
        for layer in available:
            action = 0 if layer['text'] in wanted else 2
            doc.set_layer_ui_config(layer['number'], action=action)
    # read all pages with the chosen layer visibility applied
    text = '|'.join(page.get_text() for page in doc)
    return find_pattern(text, include, exclude, remove)
def extract_tag(file, patterns):
    """Extract every pattern of <patterns> from the PDF <file>.

    Args:
        file (file object): uploaded PDF file object to be extracted
        patterns (list): rows [name, where, labels, include, exclude, remove]

    Returns:
        tuple: (tag_list, error_txt) where tag_list is
            [[pattern name, tag, file name], ...] and error_txt is '' or a
            message listing every unknown 'Where' keyword (previously only
            the last error survived the loop).
    """
    # creating a pdf reader object from the uploaded stream
    doc = fitz.open(stream=file.read(), filetype='pdf')
    tag_list = []
    errors = []
    # go through all patterns to be detected
    for pattern in patterns:
        pname = pattern[0].strip()
        where = pattern[1].strip().upper()
        label = pattern[2].strip()
        include, exclude, remove = pattern[3], pattern[4], pattern[5]
        # bug fix: default to [] so an unknown 'where' can neither raise
        # NameError (first iteration) nor leak the previous pattern's tags
        tags = []
        if where == "TEXT":
            tags = get_from_text(doc, include, exclude, remove)
        elif where == "TOC":
            tags = get_from_toc(doc, include, exclude, remove)
        elif where == "BOOKMARK":
            tags = get_bookmark(doc, label, include, exclude, remove)
        elif where == "LAYER":
            tags = get_layer(doc, label, include, exclude, remove)
        elif where == "PATH":
            tags = find_pattern(file.name, include, exclude, remove)
        else:
            # bug fix: missing space before 'does not exist'
            errors.append(where + ' does not exist')
        for tag in tags:
            tag_list.append([pname, tag, file.name])
    return tag_list, ' / '.join(errors)
def file_info(file_list):
    """Build a summary table (name, page count, usable 'Where' sources) for
    each uploaded PDF.

    Bug fix: the previous version called os.dup() on the list of uploaded
    files ('os' was never imported, so this raised NameError; os.dup also
    duplicates an fd, not a list). The files are now read directly and each
    stream is rewound afterwards so later extraction can re-read it.

    Args:
        file_list (list): uploaded PDF file objects

    Returns:
        pandas.DataFrame: columns 'File', 'Pages' and 'Wheres'
    """
    res = {"File": [], "Pages": [], "Wheres": []}
    for file in file_list:
        doc = fitz.open(stream=file.read(), filetype='pdf')
        file.seek(0)  # rewind so the stream can be consumed again later
        res['File'].append(file.name)
        res['Pages'].append(doc.page_count)
        # detect which extraction sources are available in this document
        where_file = []
        if len(doc.layer_ui_configs()) > 0:
            where_file.append('LAYER')
        if ''.join(page.get_text() for page in doc) != '':
            where_file.append('TEXT')
        if len(doc.get_toc()) > 0:
            where_file.append('BOOKMARK')
        res['Wheres'].append(where_file)
        doc.close()
    return pd.DataFrame(res)
##################################### Define Streamlit interface ########################################
# Page-wide layout plus the application title/version banner.
st.set_page_config(layout="wide")
st.markdown('## **PDF tag Extractor**')
st.markdown('**v2.40** (June 2023 / S. Jaumain)')
#st.markdown('###### by S. Jaumain')
# Three-step workflow: upload files, edit regex patterns, run/export result.
tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result'])
##################################### TAB 1 ########################################
with tab1:
    st.subheader('Choose your PDF file(s):')
    placeholder = st.empty()
    #placeholder2 = st.empty()
    # keep the uploaded files in the session so the other tabs can reach them
    st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True)
    # check existence of PDF files
    if st.session_state.pdf_files:
        # bug fix: the icon literal contained a raw newline (mojibake split
        # the emoji across two source lines), which is a SyntaxError;
        # restored to a single checkmark emoji
        placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='✅')
        #with placeholder2.expander(':information_source: FILE INFO'):
        #    st.dataframe(file_info(st.session_state.pdf_files), use_container_width=True, hide_index=True)
    else:
        placeholder.warning('No file selected yet.', icon='π’')
##################################### TAB 2 ########################################
# Default pattern table shown in the editor: one sample row extracting
# instrument tags from the PDF bookmarks.
patterns = [["Tags Instrument",
             "BOOKMARK",
             "instrument",
             "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}",
             "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)",
             "",
             ]
            ]
st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name', 'Where', 'Labels', 'Include', 'Exclude', 'Remove'])
st.session_state.df_pattern.index.name = "Pattern #"
st.session_state.flag = False
# Markdown shown inside the HELP expander of tab 2.
help_lines = """
:blue[Name] give a string with the name/type to be displayed in the output list
:blue[Where] give a list [...] of strings with following options:
- ["TEXT"] = search in plain PDF text
- ["BOOKMARK",<label>] = search in bookmarks with name containing <label>. if <name>="$" then all.
- ["LAYER", <list>] = search in layers named in <list> as a list of strings
- ["PATH"] = search pattern in path name.
- ["TOC"] = search pattern in table of content.
:blue[Include] give a regex string for the patterns to include
:blue[Exclude] give a regex string for the patterns to exclude. :red[BEWARE:] exclude has priority 2
:blue[Remove] a list of strings to be removed from found patterns :red[BEWARE:] remove has priority 1
"""
warn_flag = True
# Keywords accepted in the 'Where' column; also used by the editor validator.
where_keywords = ['TEXT', 'PATH', 'BOOKMARK', 'LAYER', 'TOC']
# Column configuration for the st.data_editor pattern table.
df_config = {
    'Name': st.column_config.TextColumn('Name',
                                        required=True
                                        ),
    'Where': st.column_config.TextColumn('Where',
                                         help='Indicate where to search. Can be ' + ', '.join(where_keywords) + '.',
                                         default='TEXT',
                                         required=True,
                                         validate='|'.join(where_keywords)
                                         ),
    'Labels': st.column_config.TextColumn('Labels',
                                          help='Indicate the label of Bookmark or Layer to search in. For all use "$".',
                                          ),
    'Include': st.column_config.TextColumn('Include',
                                           help='For examples of REGEXs please refer to https://regex101.com/',
                                           required=True,
                                           # bug fix: '\S' in a plain literal is an invalid escape
                                           # sequence (DeprecationWarning); raw string, same value
                                           validate=r'\S'
                                           ),
    'Exclude': st.column_config.TextColumn('Exclude',
                                           help='For examples of REGEXs please refer to https://regex101.com/',
                                           required=False,
                                           default='',
                                           #validate=r'\S'
                                           )
}
with tab2:
    if 'df_error' not in st.session_state:
        st.session_state.df_error = False
    st.header('REGEX dictionary')
    with st.expander(':question: HELP'):
        st.markdown(help_lines)
    tab2_placehld = st.empty()
    # editable pattern table; edits are tracked under the 'TT' session key
    st.session_state.df_pattern = st.data_editor(st.session_state.df_pattern,
                                                 column_config=df_config,
                                                 use_container_width=True,
                                                 num_rows='dynamic',
                                                 #disabled=['Check'],
                                                 key='TT')
    # re-validate the whole table whenever a row was edited or added
    if st.session_state.TT['edited_rows'] != {} or st.session_state.TT['added_rows'] != {}:
        st.session_state.df_error = False
        for i, row in st.session_state.df_pattern.iterrows():
            # BOOKMARK and LAYER searches need a label to select from
            if row['Where'] in ['BOOKMARK', 'LAYER'] and row['Labels'] == '':
                st.session_state.df_error = True
                tab2_placehld.warning('"' + row['Name'] + '" row: missing <LABEL> error. Required with Bookmarks or Layers.', icon='π’')
            # the Include regex must compile
            try:
                re.compile(row['Include'])
            except re.error:
                tab2_placehld.warning('"' + row['Name'] + '" row: Include REGEX pattern not valid. Refer to HELP', icon='π’')
                st.session_state.df_error = True
            # empty Exclude cells come back as None from the editor;
            # bug fix: identity test ('is None') instead of '== None'
            if row['Exclude'] is None:
                st.session_state.df_pattern.loc[i, 'Exclude'] = ''
            else:
                try:
                    re.compile(row['Exclude'])
                except re.error:
                    tab2_placehld.warning('"' + row['Name'] + '" row: Exclude REGEX pattern not valid. Refer to HELP', icon='π’')
                    st.session_state.df_error = True
##################################### TAB 3 ########################################
# Snapshot of the pattern table as plain rows
# [Name, Where, Labels, Include, Exclude, Remove].
patterns = st.session_state.df_pattern.values.tolist()
st.session_state.df = pd.DataFrame({})
with tab3:
    col1, col2 = st.columns(2)
    tab3_placehld = col1.empty()
    filename = col1.text_input('XLSX output file name for extracted tags:', value='tags.xlsx')
    filename = filename.split('.')[0]  # keep the stem; '.xlsx' is re-appended at download
    rm_duplicates = col1.checkbox('remove duplicates?', value=True)
    one_sheet = col1.checkbox('All extraction categories on one single sheet?', value=True)
    btn = col1.button('Extract tags')
    tab3_placehld2 = col1.empty()
    if btn and len(st.session_state.pdf_files) == 0:
        tab3_placehld.warning('No files selected!', icon='β')
    if btn and len(st.session_state.pdf_files) > 0:
        tag_list = []
        error_list = []
        progress_text = 'Extraction on-going'
        progress_bar = col2.progress(0, text=progress_text)
        for i, file in enumerate(st.session_state.pdf_files):
            tag_ls, err_txt = extract_tag(file, patterns)
            tag_list.extend(tag_ls)
            # bug fix: err_txt is a string — extend() added it char by char
            if err_txt:
                error_list.append(err_txt)
            progress_bar.progress((i + 1) / len(st.session_state.pdf_files), text=progress_text)
        progress_bar.progress((i + 1) / len(st.session_state.pdf_files), text="Completed")
        st.session_state.df = pd.DataFrame(tag_list, columns=['Tag type', 'Tag', 'Origin file'])
        if rm_duplicates:
            # a tag may legitimately appear in several files: dedupe on (Tag, file)
            st.session_state.df = st.session_state.df.drop_duplicates(subset=['Tag', 'Origin file'])
        col2.success(f'Tag(s) found: {st.session_state.df.shape[0]}')
        col2.dataframe(st.session_state.df, use_container_width=True, hide_index=True)
        if st.session_state.df.shape[0] > 0:
            # build the XLSX in memory so download_button can serve it
            buffer = BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as excel:
                if one_sheet:
                    st.session_state.df.to_excel(excel, sheet_name='tags', index=False)
                else:
                    # one worksheet per extraction category ('Tag type')
                    for category in pd.unique(st.session_state.df['Tag type']):
                        st.session_state.df[st.session_state.df['Tag type'] == category].to_excel(excel, sheet_name=category, index=False)
            col2.download_button('π₯ Download as XLSX', data=buffer, file_name=filename + '.xlsx', mime='application/vnd.ms-excel')
        else:
            # fix: removed needless f-string prefix on a literal without placeholders
            col1.warning('File empty! Not written.', icon='β')