#!python
#
# Engineering PDF tag extractor
# by Serge Jaumain / SPIE Oil & Gas Services
#
# 31/05/2023

# importing required modules
import re
import pandas as pd
import fitz
import streamlit as st
from io import BytesIO


def find_pattern(text, include, exclude, remove):
    """Extract every match of ``include`` from ``text``, minus excluded ones.

    Processing order:
      1. every match of the ``remove`` regex is deleted from ``text``;
      2. all matches of the ``include`` regex are collected;
      3. matches in which the ``exclude`` regex is found are dropped.

    Args:
        text (string): Text to be scanned
        include (string): REGEX expression to extract patterns from text
            (``None`` is treated as an empty expression)
        exclude (string): REGEX expression; a match containing it is discarded.
            Empty / ``None`` disables the filtering.
        remove (string): REGEX expression deleted from ``text`` before the
            search. Empty / ``None`` disables the removal.

    Returns:
        list: matches as returned by ``re.findall`` after filtering
    """
    if include is None:
        include = ''
    # An empty "remove" expression would substitute nothing, so skip the call.
    if remove:
        text = re.sub(remove, '', text)

    found = re.findall(include, text)
    if not exclude:
        return found
    return [el for el in found if re.search(exclude, el) is None]


def get_from_text(doc, include, exclude, remove):
    """Retrieve tags from the page text of a PDF with all layers visible.

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): regex of text to remove before searching

    Returns:
        list: raw list of tags found
    """
    # switch on all layers (action=0 means "set on")
    for doc_layer in doc.layer_ui_configs():
        doc.set_layer_ui_config(doc_layer['number'], action=0)

    # pages are joined with '|' so a match cannot span two pages
    text = '|'.join(page.get_text() for page in doc)
    return find_pattern(text, include, exclude, remove)


def get_from_toc(doc, include, exclude, remove):
    """Retrieve tags from the titles of the PDF table of contents.

    ``doc.get_toc()`` returns a list of entries whose second item is the
    title; the titles are joined into one string before being scanned
    (the regex functions cannot work on the nested list itself).

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): regex of text to remove before searching

    Returns:
        list: raw list of tags found
    """
    titles = [str(entry[1]) for entry in doc.get_toc()]
    return find_pattern('|'.join(titles), include, exclude, remove)


def get_bookmark(doc, bm_text, include, exclude, remove):
    """Retrieve tags from the bookmarks (TOC entries) of a PDF.

    When ``bm_text`` is ``'$'`` every bookmark title is scanned. Otherwise
    only the bookmarks located below a level-1 bookmark whose title contains
    ``bm_text`` are scanned; the level-1 title itself is not scanned.

    Args:
        doc (fitz document): actual pdf document to extract
        bm_text (string): text selecting the level-1 bookmarks to search
            below (not case sensitive), or '$' for all bookmarks
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): regex of text to remove before searching

    Returns:
        list: list like [tag1, tag2, ...]
    """
    search_all = bm_text == '$'
    wanted = bm_text.upper()

    tags = []
    in_section = False
    for item in doc.get_toc():
        level, title = item[0], item[1]
        if not search_all and level == 1:
            # a level-1 bookmark opens (or closes) the section being searched
            in_section = wanted in title.upper()
            continue
        if search_all or in_section:
            tags.extend(find_pattern(title, include, exclude, remove))
    return tags


def get_layer(doc, layer2search, include, exclude, remove):
    """Retrieve tags from the text of selected PDF layers.

    All layers are switched on when any requested name starts with "$".
    Otherwise a layer is switched on when its name is contained in at least
    one requested name, and switched off in every other case.

    Args:
        doc (fitz document): actual pdf document to extract
        layer2search (string or list of strings): name(s) of the layer(s)
            to be extracted; a single string is handled as a one-item list
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): regex of text to remove before searching

    Returns:
        list: raw list of tags found
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(layer2search, str):
        layer2search = [layer2search]
    wanted = [name.strip() for name in layer2search]

    show_all = any(name.startswith('$') for name in wanted)

    # Decide the visibility of each layer once, looking at all requested
    # names together, so that one request cannot undo another.
    for layer in doc.layer_ui_configs():
        visible = show_all or any(layer['text'] in name for name in wanted)
        # action: 0 = set on, 2 = set off
        doc.set_layer_ui_config(layer['number'], action=0 if visible else 2)

    # get all pages
    text = '|'.join(page.get_text() for page in doc)
    return find_pattern(text, include, exclude, remove)


def _read_pdf_bytes(file):
    """Return the whole content of ``file``, rewinding it first if possible."""
    if hasattr(file, 'seek'):
        file.seek(0)
    return file.read()


def extract_tag(file, patterns):
    """Extract the tags described by ``patterns`` from a PDF file.

    Args:
        file (file object): PDF file object to be extracted; must provide
            ``read()`` and a ``name`` attribute
        patterns (list): list of patterns, each one being a sequence
            [name, where, label, include, exclude, remove] where ``where``
            is one of TEXT, TOC, BOOKMARK, LAYER or PATH (case insensitive)

    Returns:
        tuple: (tag_list, error_txt) where tag_list looks like
        [[pattern name1, tag1, filename], [pattern name2, tag2, filename], ...]
        and error_txt is '' when every ``where`` was recognised, else the
        messages of all unknown ``where`` values separated by '; '
    """
    # creating a pdf reader object
    doc = fitz.open(stream=_read_pdf_bytes(file), filetype='pdf')

    tag_list = []
    errors = []
    try:
        # go through all patterns to be detected
        for pattern in patterns:
            pname = pattern[0].strip()
            where = pattern[1].strip().upper()
            label = pattern[2].strip()
            include = pattern[3]
            exclude = pattern[4]
            remove = pattern[5]

            if where == "TEXT":
                tags = get_from_text(doc, include, exclude, remove)
            elif where == "TOC":
                tags = get_from_toc(doc, include, exclude, remove)
            elif where == "BOOKMARK":
                tags = get_bookmark(doc, label, include, exclude, remove)
            elif where == "LAYER":
                tags = get_layer(doc, label, include, exclude, remove)
            elif where == "PATH":
                tags = find_pattern(file.name, include, exclude, remove)
            else:
                # Unknown location: report it and contribute no tag, instead
                # of reusing (or missing) the tags of a previous pattern.
                tags = []
                errors.append(f'{where} does not exist')

            tag_list.extend([pname, tag, file.name] for tag in tags)
    finally:
        doc.close()

    return tag_list, '; '.join(errors)


def file_info(file_list):
    """Summarise the uploaded PDF files.

    Args:
        file_list (list): PDF file objects providing ``read()`` and ``name``

    Returns:
        DataFrame: one row per file with the columns "File" (file name),
        "Pages" (page count) and "Wheres" (list among 'LAYER', 'TEXT',
        'BOOKMARK' telling where content was found in the file)
    """
    res = {"File": [], "Pages": [], "Wheres": []}
    for file in file_list:
        doc = fitz.open(stream=_read_pdf_bytes(file), filetype='pdf')
        try:
            where_file = []
            if doc.layer_ui_configs():
                where_file.append('LAYER')
            if any(page.get_text() for page in doc):
                where_file.append('TEXT')
            if doc.get_toc():
                where_file.append('BOOKMARK')

            res['File'].append(file.name)
            res['Pages'].append(doc.page_count)
            res['Wheres'].append(where_file)
        finally:
            doc.close()
            # leave the stream rewound for whoever reads the file next
            if hasattr(file, 'seek'):
                file.seek(0)
    return pd.DataFrame(res)


##################################### Define Streamlit interface ########################################
st.set_page_config(layout="wide")
st.markdown('## **PDF tag Extractor**')
st.markdown('**v2.40** (June 2023 / S. Jaumain)')
#st.markdown('###### by S. 
Jaumain') tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result']) ##################################### TAB 1 ######################################## with tab1: st.subheader('Choose your PDF file(s):') placeholder = st.empty() #placeholder2 = st.empty() st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True) # check existence of PDF files if st.session_state.pdf_files: placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='✅') #with placeholder2.expander(':information_source: FILE INFO'): # st.dataframe(file_info(st.session_state.pdf_files), use_container_width=True, hide_index=True) else: placeholder.warning('No file selected yet.', icon='📢') ##################################### TAB 2 ######################################## patterns = [["Tags Instrument", "BOOKMARK", "instrument", "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}", "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)", "", ] ] st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name','Where','Labels','Include','Exclude','Remove']) st.session_state.df_pattern.index.name = "Pattern #" st.session_state.flag=False help_lines = """ :blue[Name] give a string with the name/type to be displayed in the output list :blue[Where] give a list [...] of strings with following options: - ["TEXT"] = search in plain PDF text - ["BOOKMARK",