# pdf-extractor / app.py
# jase64's picture
# Create app.py
# aa0784d
#!python
# # Engineering PDF tag extractor
# by Serge Jaumain / SPIE Oil & Gas Services
#
# 31/05/2023
# importing required modules
import re
import pandas as pd
import fitz
import streamlit as st
from io import BytesIO
def find_pattern(text, include, exclude, remove):
    """Find all occurrences of <include> in <text>, dropping candidates that match <exclude>.

    Every occurrence of <remove> is stripped from the text before matching.

    Args:
        text (str): text to be scanned.
        include (str): regex of the patterns to extract (None is treated as '').
        exclude (str): regex; any candidate containing a match of this pattern
            is discarded (empty/None disables the filter).
        remove (str): regex of substrings removed from the text before the
            search (None is treated as '').

    Returns:
        list[str]: matches of <include>, in order of appearance.
    """
    # Normalize None to the neutral pattern so the re calls below cannot fail.
    if remove is None:
        remove = ''
    if include is None:
        include = ''
    candidates = re.findall(include, re.sub(remove, '', text))
    if not exclude:
        return candidates
    # Keep only candidates that contain no occurrence of the exclude pattern.
    return [cand for cand in candidates if re.search(exclude, cand) is None]
def get_from_text(doc, include, exclude, remove):
    """Extract tag patterns from the visible text of every page of the PDF.

    All optional-content layers are switched on first so no text is hidden
    from the extraction.

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found
    """
    # Switch on all layers (action=0) so every page's text is visible.
    for layer in doc.layer_ui_configs():
        doc.set_layer_ui_config(layer['number'], action=0)
    # Concatenate all page texts; '|' keeps tags on page boundaries apart.
    pages_text = '|'.join(page.get_text() for page in doc)
    return find_pattern(pages_text, include, exclude, remove)
def get_from_toc(doc, include, exclude, remove):
    """Extract tag patterns from the PDF table of contents.

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found in the TOC titles
    """
    # BUG FIX: doc.get_toc() returns a list of [level, title, page] entries,
    # not a string -- passing it straight to find_pattern() raised a TypeError
    # inside re.sub().  Join the entry titles into one searchable string
    # instead ('|' keeps adjacent titles from merging into one tag).
    text = '|'.join(str(item[1]) for item in doc.get_toc())
    return find_pattern(text, include, exclude, remove)
def get_bookmark(doc, bm_text, include, exclude, remove):
    """Extract tag patterns from the PDF bookmarks (outline).

    Args:
        doc (fitz document): actual pdf document to extract
        bm_text (string): contains a string for the selection of the bookmarks to search (not case sensitive)
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: list like [tag1, tag2, ...]
    """
    tags = []
    search_all = (bm_text == '$')
    in_selected_section = False
    for level, title, *_ in doc.get_toc():
        if search_all:
            # '$' selector: scan every bookmark title.
            tags.extend(find_pattern(title, include, exclude, remove))
            continue
        if level == 1:
            # A top-level entry opens (or closes) a section depending on
            # whether its title contains the requested text.
            in_selected_section = bm_text.upper() in title.upper()
        elif in_selected_section:
            # Deeper entries are scanned only inside a selected section.
            tags.extend(find_pattern(title, include, exclude, remove))
    return tags
def get_layer(doc, layer2search, include, exclude, remove):
    """Extract tag patterns from the text of selected PDF layers.

    Args:
        doc (fitz document): actual pdf document to extract
        layer2search (iterable of str): names of the layers to be extracted;
            if any entry starts with '$' every layer is switched on instead
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found
    """
    doc_layers = doc.layer_ui_configs()
    # Switch on all layers if "$" is found somewhere, otherwise switch on
    # the wanted layers (action=0) and switch off everything else (action=2).
    for wanted in layer2search:
        # BUG FIX: startswith() instead of strip()[0] avoids an IndexError
        # when an entry is empty/whitespace-only.
        if wanted.strip().startswith("$"):
            for layer in doc_layers:
                doc.set_layer_ui_config(layer['number'], action=0)
            break
        for layer in doc_layers:
            # NOTE(review): this tests that the layer name is a substring of
            # the requested entry (not the other way round) -- confirm the
            # intended direction.
            if layer['text'] in wanted.strip():
                doc.set_layer_ui_config(layer['number'], action=0)
            else:
                doc.set_layer_ui_config(layer['number'], action=2)
    # Collect the text of all (now visible) pages.
    text = '|'.join(page.get_text() for page in doc)
    return find_pattern(text, include, exclude, remove)
def extract_tag(file, patterns):
    """Extract every pattern of <patterns> from the PDF <file>.

    Args:
        file (file object): PDF file object to be extracted
        patterns (list): rows [name, where, label, include, exclude, remove]

    Returns:
        tuple: (tag_list, error_txt) where tag_list is
            [[pattern name, tag, filename], ...] and error_txt describes the
            last unknown 'Where' keyword ('' if all rows were valid).
    """
    # Create the pdf reader object from the uploaded stream.
    doc = fitz.open(stream=file.read(), filetype='pdf')
    tag_list = []
    # BUG FIX: error_txt used to be reset on every iteration, so an invalid
    # row was silently forgotten unless it happened to be the last one.
    error_txt = ''
    # Dispatch each pattern row to the matching extraction source.
    for pattern in patterns:
        pname = pattern[0].strip()
        where = pattern[1].strip().upper()
        label = pattern[2].strip()
        include = pattern[3]
        exclude = pattern[4]
        remove = pattern[5]
        # BUG FIX: 'tags' was unbound when 'where' was unknown -> NameError.
        tags = []
        if where == "TEXT":
            tags = get_from_text(doc, include, exclude, remove)
        elif where == "TOC":
            tags = get_from_toc(doc, include, exclude, remove)
        elif where == "BOOKMARK":
            tags = get_bookmark(doc, label, include, exclude, remove)
        elif where == "LAYER":
            tags = get_layer(doc, label, include, exclude, remove)
        elif where == "PATH":
            tags = find_pattern(file.name, include, exclude, remove)
        else:
            # BUG FIX: missing space produced e.g. 'FOOdoes not exist'.
            error_txt = where + ' does not exist'
        for tag in tags:
            tag_list.append([pname, tag, file.name])
    return tag_list, error_txt
def file_info(file_list):
    """Build a summary table (name, page count, searchable sources) for PDFs.

    Args:
        file_list (list): uploaded PDF file objects.

    Returns:
        pandas.DataFrame: columns 'File' (name), 'Pages' (page count) and
            'Wheres' (which of LAYER/TEXT/BOOKMARK the file can be searched in).
    """
    res = {"File": [], "Pages": [], "Wheres": []}
    # BUG FIX: the original called os.dup(file_list) -- 'os' was never
    # imported, and os.dup() duplicates a file descriptor, not a list.
    # Iterate the uploaded files directly.
    for file in file_list:
        doc = fitz.open(stream=file.read(), filetype='pdf')
        # Rewind the upload stream so a later extraction can re-read it.
        file.seek(0)
        res['File'].append(file.name)
        res['Pages'].append(doc.page_count)
        where_file = []
        if len(doc.layer_ui_configs()) > 0:
            where_file.append('LAYER')
        if ''.join(page.get_text() for page in doc) != '':
            where_file.append('TEXT')
        if len(doc.get_toc()) > 0:
            where_file.append('BOOKMARK')
        res['Wheres'].append(where_file)
        doc.close()
    return pd.DataFrame(res)
##################################### Define Streamlit interface ########################################
# Page chrome and the three workflow tabs: upload -> patterns -> extraction.
st.set_page_config(layout="wide")
st.markdown('## **PDF tag Extractor**')
st.markdown('**v2.40** (June 2023 / S. Jaumain)')
#st.markdown('###### by S. Jaumain')
tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result'])
##################################### TAB 1 ########################################
with tab1:
    st.subheader('Choose your PDF file(s):')
    placeholder = st.empty()
    #placeholder2 = st.empty()
    # Uploaded files live in session state so the other tabs can read them.
    st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True)
    # check existence of PDF files
    if st.session_state.pdf_files:
        placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='βœ…')
        #with placeholder2.expander(':information_source: FILE INFO'):
        #    st.dataframe(file_info(st.session_state.pdf_files), use_container_width=True, hide_index=True)
    else:
        placeholder.warning('No file selected yet.', icon='πŸ“’')
##################################### TAB 2 ########################################
# Default pattern table shown in the editor: one example row extracting
# instrument tags from bookmarks.
patterns = [["Tags Instrument",
             "BOOKMARK",
             "instrument",
             "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}",
             "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)",
             "",
             ]
            ]
st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name','Where','Labels','Include','Exclude','Remove'])
st.session_state.df_pattern.index.name = "Pattern #"
st.session_state.flag=False
# Markdown text shown in the HELP expander of tab 2.
help_lines = """
:blue[Name] give a string with the name/type to be displayed in the output list
:blue[Where] give a list [...] of strings with following options:
- ["TEXT"] = search in plain PDF text
- ["BOOKMARK",<label>] = search in bookmarks with name containing <label>. if <name>="$" then all.
- ["LAYER", <list>] = search in layers named in <list> as a list of strings
- ["PATH"] = search pattern in path name.
- ["TOC"] = search pattern in table of content.
:blue[Include] give a regex string for the patterns to include
:blue[Exclude] give a regex string for the patterns to exclude. :red[BEWARE:] exclude has priority 2
:blue[Remove] a list of strings to be removed from found patterns :red[BEWARE:] remove has priority 1
"""
warn_flag = True
# Allowed values for the 'Where' column of the pattern editor.
where_keywords = ['TEXT', 'PATH', 'BOOKMARK', 'LAYER', 'TOC']
# Column configuration for st.data_editor: inline help text plus per-column
# validation regexes (the 'Where' column must match one of the keywords).
df_config = {
    'Name':st.column_config.TextColumn('Name',
                                       required=True
                                       ),
    'Where': st.column_config.TextColumn('Where',
                                         help='Indicate where to search. Can be '+', '.join(where_keywords)+'.',
                                         default='TEXT',
                                         required=True,
                                         validate='|'.join(where_keywords)
                                         ),
    'Labels': st.column_config.TextColumn('Labels',
                                          help='Indicate the label of Bookmark or Layer to search in. For all use "$".',
                                          ),
    'Include':st.column_config.TextColumn('Include',
                                          help='For examples of REGEXs please refer to https://regex101.com/',
                                          required=True,
                                          validate='\S'
                                          ),
    'Exclude':st.column_config.TextColumn('Exclude',
                                          help='For examples of REGEXs please refer to https://regex101.com/',
                                          required=False,
                                          default='',
                                          #validate='\S'
                                          )
    }
with tab2:
    # df_error flags an invalid pattern table for the rest of the app.
    if 'df_error' not in st.session_state:
        st.session_state.df_error = False
    st.header('REGEX dictionary')
    with st.expander(':question: HELP'):
        st.markdown(help_lines)
    tab2_placehld = st.empty()
    # Editable pattern table; key 'TT' exposes the editor state in session_state.
    st.session_state.df_pattern = st.data_editor(st.session_state.df_pattern,
                                                 column_config=df_config,
                                                 use_container_width=True,
                                                 num_rows='dynamic',
                                                 #disabled=['Check'],
                                                 key='TT')
    # Re-validate the whole table whenever rows were edited or added.
    # NOTE(review): 'added_rows' is a list in the editor state, so comparing
    # it to {} is always True -- confirm the intended trigger condition.
    if st.session_state.TT['edited_rows'] != {} or st.session_state.TT['added_rows'] != {}:
        st.session_state.df_error = False
        for i, row in st.session_state.df_pattern.iterrows():
            # BOOKMARK/LAYER rows need a label saying where to search.
            if row['Where'] in ['BOOKMARK', 'LAYER'] and row['Labels']=='':
                st.session_state.df_error = True
                tab2_placehld.warning('"'+row['Name']+'" row: missing <LABEL> error. Required with Bookmarks or Layers.', icon='πŸ“’')
            # The Include regex must compile.
            try:
                re.compile(row['Include'])
            except re.error:
                tab2_placehld.warning('"'+row['Name']+'" row: Include REGEX pattern not valid. Refer to HELP', icon='πŸ“’')
                st.session_state.df_error = True
            # Empty Exclude is normalized to ''; otherwise it must compile.
            if row['Exclude']==None:
                st.session_state.df_pattern.loc[i,'Exclude']=''
            else:
                try:
                    re.compile(row['Exclude'])
                except re.error:
                    tab2_placehld.warning('"'+row['Name']+'" row: Exclude REGEX pattern not valid. Refer to HELP', icon='πŸ“’')
                    st.session_state.df_error = True
##################################### TAB 3 ########################################
# Snapshot the (possibly edited) pattern table as plain rows for extract_tag().
patterns = st.session_state.df_pattern.values.tolist()
st.session_state.df = pd.DataFrame({})
with tab3:
    col1, col2 = st.columns(2)
    tab3_placehld = col1.empty()
    filename = col1.text_input('XLSX output file name for extracted tags:', value='tags.xlsx')
    # Keep only the base name; '.xlsx' is re-appended on download.
    filename = filename.split('.')[0]
    rm_duplicates = col1.checkbox('remove duplicates?', value=True)
    one_sheet = col1.checkbox('All extraction categories on one single sheet?', value=True)
    btn = col1.button('Extract tags')
    tab3_placehld2 = col1.empty()
    if btn and len(st.session_state.pdf_files) == 0:
        tab3_placehld.warning('No files selected!', icon='β›”')
    if btn and len(st.session_state.pdf_files) > 0 :
        tag_list = []
        error_list = []
        progress_text = 'Extraction on-going'
        progress_bar = col2.progress(0, text=progress_text)
        # Run every pattern on every uploaded file, updating the progress bar.
        for i, file in enumerate(st.session_state.pdf_files):
            tag_ls, err_txt = extract_tag(file, patterns)
            tag_list.extend(tag_ls)
            # NOTE(review): extend() on a string adds it char by char, and
            # error_list is never read afterwards -- probably meant append().
            error_list.extend(err_txt)
            progress_bar.progress((i+1)/len(st.session_state.pdf_files), text=progress_text)
        progress_bar.progress((i+1)/len(st.session_state.pdf_files), text="Completed")
        st.session_state.df = pd.DataFrame(tag_list, columns=['Tag type','Tag','Origin file'])
        if rm_duplicates:
            # A tag is a duplicate when tag AND origin file both match.
            st.session_state.df = st.session_state.df.drop_duplicates(subset=['Tag','Origin file'])
        col2.success(f'Tag(s) found: {st.session_state.df.shape[0]}')
        col2.dataframe(st.session_state.df, use_container_width=True, hide_index=True)
        if st.session_state.df.shape[0] > 0:
            # Write the result to an in-memory XLSX workbook for download.
            buffer = BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as excel:
                if one_sheet:
                    st.session_state.df.to_excel(excel, sheet_name='tags', index=False)
                else:
                    # One worksheet per tag category.
                    for category in pd.unique(st.session_state.df['Tag type']):
                        st.session_state.df[st.session_state.df['Tag type'] == category].to_excel(excel, sheet_name=category, index=False)
            #excel.close()
            col2.download_button('πŸ“₯ Download as XLSX', data=buffer, file_name= filename + '.xlsx', mime='application/vnd.ms-excel')
        else:
            col1.warning(f'File empty! Not written.', icon='❌')