# NOTE(review): the three lines that were here ("Spaces:" / "Configuration
# error" x2) look like file-export artifacts, not Python source — commented
# out so the module parses.
#!python | |
# # Engineering PDF tag extractor | |
# by Serge Jaumain / SPIE Oil & Gas Services | |
# | |
# 31/05/2023 | |
# importing required modules | |
import re | |
import pandas as pd | |
import fitz | |
import streamlit as st | |
from io import BytesIO | |
def find_pattern(text, include, exclude, remove):
    """Find all <include> matches in <text>, dropping those that match <exclude>.

    The <remove> pattern is stripped from the text *before* extraction.

    Args:
        text (str): text to be scanned
        include (str): regex of the patterns to extract from the text
        exclude (str): regex; any extracted match that also matches it is dropped
        remove (str): regex removed from the text before extraction

    Returns:
        list: extracted matches, in order of appearance
    """
    # normalise missing arguments to the empty pattern
    # (was `== None`; identity comparison is the Python idiom)
    if remove is None:
        remove = ''
    if include is None:
        include = ''
    found = re.findall(include, re.sub(remove, '', text))
    if not exclude:
        return found
    # keep only the matches in which the exclusion regex finds nothing
    return [el for el in found if not re.search(exclude, el)]
def get_from_text(doc, include, exclude, remove):
    """Extract tag patterns from the visible text of a PDF.

    Every optional-content layer is switched on first so that text hidden
    on disabled layers is included in the scan.

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found
    """
    # action=0 switches a layer on; do it for every layer
    for layer in doc.layer_ui_configs():
        doc.set_layer_ui_config(layer['number'], action=0)
    # join per-page text with '|' so tags on page boundaries don't merge
    pages = [page.get_text() for page in doc]
    return find_pattern('|'.join(pages), include, exclude, remove)
def get_from_toc(doc, include, exclude, remove):
    """Extract tag patterns from the table of contents of a PDF.

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found
    """
    # get_toc() returns [level, title, page] entries, but find_pattern
    # expects a string (the original passed the raw list, which made
    # re.sub raise a TypeError) -- join the entry titles instead.
    text = '|'.join(item[1] for item in doc.get_toc())
    return find_pattern(text, include, exclude, remove)
def get_bookmark(doc, bm_text, include, exclude, remove):
    """Extract tag patterns from the bookmark titles of a PDF.

    Args:
        doc (fitz document): actual pdf document to extract
        bm_text (string): contains a string for the selection of the bookmarks to search (not case sensitive)
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: list like [tag1, tag2, ...]
    """
    found = []
    selected = False
    for level, title, *_ in doc.get_toc():
        if bm_text == '$':
            # wildcard: scan every bookmark title
            found.extend(find_pattern(title, include, exclude, remove))
        elif level == 1:
            # a top-level entry decides whether its children get scanned
            selected = bm_text.upper() in title.upper()
        elif selected:
            found.extend(find_pattern(title, include, exclude, remove))
    return found
def get_layer(doc, layer2search, include, exclude, remove):
    """Extract tag patterns from the text of selected optional-content layers.

    Args:
        doc (fitz document): actual pdf document to extract
        layer2search (list): layer names to be extracted; an entry starting
            with "$" means "all layers"
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found
    """
    doc_layers = doc.layer_ui_configs()
    # switch on all layers if "$" is found somewhere
    # else switch off all layers not wanted
    for layersearched in layer2search:
        if layersearched.strip()[0] == "$":
            # wildcard: turn every layer on (action=0) and stop scanning
            for layer in doc_layers:
                doc.set_layer_ui_config(layer['number'], action=0)
            break
        else:
            # action=0 switches a layer on, action=2 switches it off
            # NOTE(review): the membership test is layer name *in* the searched
            # string -- confirm this is intended and not the reverse
            # (layersearched in layer['text'])
            for layer in doc_layers:
                if layer['text'] in layersearched.strip():
                    doc.set_layer_ui_config(layer['number'], action=0)
                else:
                    doc.set_layer_ui_config(layer['number'], action=2)
    # collect text of all pages with the chosen layer visibility applied
    text = '|'.join([page.get_text() for page in doc])
    return find_pattern(text, include, exclude, remove)
def extract_tag(file, patterns):
    """Extract every pattern of <patterns> from one PDF <file>.

    Args:
        file (file object): PDF file object to be extracted
        patterns (list): rows [name, where, label, include, exclude, remove]

    Returns:
        tuple: ([[pattern name, tag, filename], ...],
                error text -- '' when every <where> keyword was valid)
    """
    # creating a pdf reader object
    doc = fitz.open(stream=file.read(), filetype='pdf')
    tag_list = []
    # keep the error across patterns (it was reset on every iteration
    # before, so earlier errors were silently lost)
    error_txt = ''
    # go through all patterns to be detected
    for pattern in patterns:
        pname = pattern[0].strip()
        where = pattern[1].strip().upper()
        label = pattern[2].strip()
        include = pattern[3]
        exclude = pattern[4]
        remove = pattern[5]
        # initialise so an unknown <where> can't leave `tags` unbound
        # (NameError) or reuse the previous pattern's results
        tags = []
        if where == "TEXT":
            tags = get_from_text(doc, include, exclude, remove)
        elif where == "TOC":
            tags = get_from_toc(doc, include, exclude, remove)
        elif where == "BOOKMARK":
            tags = get_bookmark(doc, label, include, exclude, remove)
        elif where == "LAYER":
            tags = get_layer(doc, label, include, exclude, remove)
        elif where == "PATH":
            tags = find_pattern(file.name, include, exclude, remove)
        else:
            error_txt = where + ' does not exist'  # was missing the space
        # one output row per tag found
        for tag in tags:
            tag_list.append([pname, tag, file.name])
    return tag_list, error_txt
def file_info(file_list):
    """Build a summary table of the uploaded PDFs.

    Args:
        file_list (list): uploaded PDF file objects

    Returns:
        pandas.DataFrame: one row per file with columns
            File (name), Pages (page count),
            Wheres (search locations usable for that file).
    """
    res = {"File": [], "Pages": [], "Wheres": []}
    # iterate directly over the uploaded files -- the original called
    # os.dup(file_list), which needs an unimported module and duplicates a
    # file descriptor, not a list: it raised a TypeError here.
    for file in file_list:
        doc = fitz.open(stream=file.read(), filetype='pdf')
        res['File'].append(file.name)
        res['Pages'].append(doc.page_count)
        where_file = []
        if len(doc.layer_ui_configs()) > 0:
            where_file.append('LAYER')
        if ''.join([page.get_text() for page in doc]) != '':
            where_file.append('TEXT')
        if len(doc.get_toc()) > 0:
            where_file.append('BOOKMARK')
        res['Wheres'].append(where_file)
        doc.close()
    return pd.DataFrame(res)
##################################### Define Streamlit interface ########################################
# Page shell: wide layout, title banner, and the three workflow tabs.
st.set_page_config(layout="wide")
st.markdown('## **PDF tag Extractor**')
st.markdown('**v2.40** (June 2023 / S. Jaumain)')
#st.markdown('###### by S. Jaumain')
tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result'])
##################################### TAB 1 ########################################
with tab1:
    st.subheader('Choose your PDF file(s):')
    # placeholder lets the status message be filled in after the uploader runs
    placeholder = st.empty()
    #placeholder2 = st.empty()
    st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True)
    # check existence of PDF files
    if st.session_state.pdf_files:
        placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='β ')
        #with placeholder2.expander(':information_source: FILE INFO'):
        #    st.dataframe(file_info(st.session_state.pdf_files), use_container_width=True, hide_index=True)
    else:
        placeholder.warning('No file selected yet.', icon='π’')
##################################### TAB 2 ########################################
# Default extraction pattern pre-loaded in the editable table.
# Row layout: [Name, Where, Labels, Include regex, Exclude regex, Remove]
patterns = [["Tags Instrument",
             "BOOKMARK",
             "instrument",
             "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}",
             "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)",
             "",
             ]
            ]
st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name','Where','Labels','Include','Exclude','Remove'])
st.session_state.df_pattern.index.name = "Pattern #"
st.session_state.flag=False
# Markdown help shown in the HELP expander of TAB 2.
help_lines = """
:blue[Name] give a string with the name/type to be displayed in the output list
:blue[Where] give a list [...] of strings with following options:
- ["TEXT"] = search in plain PDF text
- ["BOOKMARK",<label>] = search in bookmarks with name containing <label>. if <name>="$" then all.
- ["LAYER", <list>] = search in layers named in <list> as a list of strings
- ["PATH"] = search pattern in path name.
- ["TOC"] = search pattern in table of content.
:blue[Include] give a regex string for the patterns to include
:blue[Exclude] give a regex string for the patterns to exclude. :red[BEWARE:] exclude has priority 2
:blue[Remove] a list of strings to be removed from found patterns :red[BEWARE:] remove has priority 1
"""
warn_flag = True
# Valid values for the "Where" column; also used to validate user input.
where_keywords = ['TEXT', 'PATH', 'BOOKMARK', 'LAYER', 'TOC']
# Column configuration of the st.data_editor pattern table.
df_config = {
    'Name':st.column_config.TextColumn('Name',
                                       required=True
                                       ),
    'Where': st.column_config.TextColumn('Where',
                                         help='Indicate where to search. Can be '+', '.join(where_keywords)+'.',
                                         default='TEXT',
                                         required=True,
                                         validate='|'.join(where_keywords)
                                         ),
    'Labels': st.column_config.TextColumn('Labels',
                                          help='Indicate the label of Bookmark or Layer to search in. For all use "$".',
                                          ),
    'Include':st.column_config.TextColumn('Include',
                                          help='For examples of REGEXs please refer to https://regex101.com/',
                                          required=True,
                                          validate='\S'
                                          ),
    'Exclude':st.column_config.TextColumn('Exclude',
                                          help='For examples of REGEXs please refer to https://regex101.com/',
                                          required=False,
                                          default='',
                                          #validate='\S'
                                          )
    }
with tab2:
    if 'df_error' not in st.session_state:
        st.session_state.df_error = False
    st.header('REGEX dictionary')
    with st.expander(':question: HELP'):
        st.markdown(help_lines)
    tab2_placehld = st.empty()
    st.session_state.df_pattern = st.data_editor(st.session_state.df_pattern,
                                                 column_config=df_config,
                                                 use_container_width=True,
                                                 num_rows='dynamic',
                                                 #disabled=['Check'],
                                                 key='TT')
    # Re-validate when the editor reports edits or additions.
    # NOTE: 'added_rows' is a *list*; the original compared it to {} which is
    # always unequal, so the gate was a no-op and validation ran every rerun.
    if st.session_state.TT['edited_rows'] or st.session_state.TT['added_rows']:
        st.session_state.df_error = False
        for i, row in st.session_state.df_pattern.iterrows():
            # BOOKMARK/LAYER searches need a label to know where to look
            if row['Where'] in ['BOOKMARK', 'LAYER'] and row['Labels'] == '':
                st.session_state.df_error = True
                tab2_placehld.warning('"'+row['Name']+'" row: missing <LABEL> error. Required with Bookmarks or Layers.', icon='π’')
            # Include must compile as a regex
            try:
                re.compile(row['Include'])
            except re.error:
                tab2_placehld.warning('"'+row['Name']+'" row: Include REGEX pattern not valid. Refer to HELP', icon='π’')
                st.session_state.df_error = True
            # Exclude is optional: normalise None to '', otherwise validate it
            if row['Exclude'] is None:
                st.session_state.df_pattern.loc[i,'Exclude'] = ''
            else:
                try:
                    re.compile(row['Exclude'])
                except re.error:
                    tab2_placehld.warning('"'+row['Name']+'" row: Exclude REGEX pattern not valid. Refer to HELP', icon='π’')
                    st.session_state.df_error = True
##################################### TAB 3 ########################################
patterns = st.session_state.df_pattern.values.tolist()
st.session_state.df = pd.DataFrame({})
with tab3:
    col1, col2 = st.columns(2)
    tab3_placehld = col1.empty()
    filename = col1.text_input('XLSX output file name for extracted tags:', value='tags.xlsx')
    filename = filename.split('.')[0]  # drop any extension the user typed
    rm_duplicates = col1.checkbox('remove duplicates?', value=True)
    one_sheet = col1.checkbox('All extraction categories on one single sheet?', value=True)
    btn = col1.button('Extract tags')
    tab3_placehld2 = col1.empty()
    if btn and len(st.session_state.pdf_files) == 0:
        tab3_placehld.warning('No files selected!', icon='β')
    if btn and len(st.session_state.pdf_files) > 0:
        tag_list = []
        error_list = []
        progress_text = 'Extraction on-going'
        progress_bar = col2.progress(0, text=progress_text)
        for i, file in enumerate(st.session_state.pdf_files):
            tag_ls, err_txt = extract_tag(file, patterns)
            tag_list.extend(tag_ls)
            # collect whole error messages -- extend() on a string added it
            # one character at a time
            if err_txt:
                error_list.append(err_txt)
            progress_bar.progress((i+1)/len(st.session_state.pdf_files), text=progress_text)
        progress_bar.progress(1.0, text="Completed")
        st.session_state.df = pd.DataFrame(tag_list, columns=['Tag type','Tag','Origin file'])
        if rm_duplicates:
            st.session_state.df = st.session_state.df.drop_duplicates(subset=['Tag','Origin file'])
        col2.success(f'Tag(s) found: {st.session_state.df.shape[0]}')
        col2.dataframe(st.session_state.df, use_container_width=True, hide_index=True)
        if st.session_state.df.shape[0] > 0:
            buffer = BytesIO()
            # the context manager closes the writer, flushing the workbook
            # into the in-memory buffer
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as excel:
                if one_sheet:
                    st.session_state.df.to_excel(excel, sheet_name='tags', index=False)
                else:
                    # one sheet per extraction category
                    for category in pd.unique(st.session_state.df['Tag type']):
                        st.session_state.df[st.session_state.df['Tag type'] == category].to_excel(excel, sheet_name=category, index=False)
            col2.download_button('π₯ Download as XLSX', data=buffer, file_name=filename + '.xlsx', mime='application/vnd.ms-excel')
        else:
            col1.warning('File empty! Not written.', icon='β')