Spaces:

spie-ogs
/

pdf-extractor

Configuration error

File size: 15,364 Bytes

aa0784d

#!python
# # Engineering PDF tag extractor
# by Serge Jaumain / SPIE Oil & Gas Services
# 
# 31/05/2023

# importing required modules
import re
import pandas as pd
import fitz
import streamlit as st
from io import BytesIO


def find_pattern(text, include, exclude, remove):
    """Extract every match of <include> from <text>, dropping those matching <exclude>.

    <remove> is applied first: every match of it is deleted from <text> before
    the search, so a removed fragment can never be part of a result.

    Args:
        text (string): Text to be scanned
        include (string): REGEX expression to extract patterns from text
        exclude (string): REGEX expression; a found pattern containing a match is discarded
        remove (string): REGEX expression deleted from text before the search

    Returns:
        list: matches as returned by re.findall, in order of appearance
    """
    # An empty include pattern matches the empty string at every position and
    # would only produce a list of '' entries, so treat it as "nothing to find".
    if not include:
        return []
    cleaned = re.sub(remove, '', text) if remove else text
    found = re.findall(include, cleaned)
    if not exclude:
        return found
    return [el for el in found if not re.search(exclude, el)]

def get_from_text(doc, include, exclude, remove):
    """Search the page text of the whole PDF with every layer switched on

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): regex string of tags to include
        exclude (string): regex string of tags to exclude
        remove (string): pattern removed from the text before searching

    Returns:
        list: raw list of tags found
    """
    # make every layer visible before reading the text
    for layer_cfg in doc.layer_ui_configs():
        doc.set_layer_ui_config(layer_cfg['number'], action=0)

    # one string for the whole document, pages separated by '|'
    page_texts = []
    for page in doc:
        page_texts.append(page.get_text())
    full_text = '|'.join(page_texts)

    return find_pattern(full_text, include, exclude, remove)

def get_from_toc(doc, include, exclude, remove):
    """Retrieves tags from the titles of the PDF table of contents

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): pattern removed from each title before searching

    Returns:
        list: raw list of tags found
    """
    # get_toc() returns a list of entries, not a string: the title is the
    # second element of each entry (same access as in get_bookmark).
    # Each title is searched on its own so that no match can span two entries.
    tags = []
    for entry in doc.get_toc():
        tags.extend(find_pattern(str(entry[1]), include, exclude, remove))
    return tags

def get_bookmark(doc, bm_text, include, exclude, remove):
    """Collect tags from the bookmark titles of the PDF

    With bm_text == '$' every bookmark title is searched. Otherwise a level-1
    bookmark only opens or closes a section (depending on whether its title
    contains bm_text, ignoring case) and the deeper bookmarks that follow are
    searched while the section is open.

    Args:
        doc (fitz document): actual pdf document to extract
        bm_text (string): text selecting the level-1 bookmarks to search under (not case sensitive)
        include (string): regex string of tags to include
        exclude (string): regex string of tags to exclude
        remove (string): pattern removed from each title before searching

    Returns:
        list: list like [tag1, tag2, ...]
    """
    search_all = bm_text == '$'
    section_open = False
    found = []
    for entry in doc.get_toc():
        if not search_all:
            if entry[0] == 1:
                # level-1 entry: decides whether the following children are searched
                section_open = bm_text.upper() in entry[1].upper()
                continue
            if not section_open:
                continue
        found.extend(find_pattern(entry[1], include, exclude, remove))
    return found

def get_layer(doc, layer2search, include, exclude, remove):
    """Retrieves the text of the selected layers from PDF

    Args:
        doc (fitz document): actual pdf document to extract
        layer2search (string or list of strings): layer name(s) to extract.
            A name starting with "$" selects all layers.
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): pattern removed from the text before searching

    Returns:
        list: raw list of tags found
    """
    # A bare string must be handled as ONE name; iterating it directly would
    # walk through its characters.
    if isinstance(layer2search, str):
        layer2search = [layer2search]
    wanted = [name.strip() for name in (layer2search or [])
              if isinstance(name, str) and name.strip()]

    # Without any usable name the current layer visibility is left untouched.
    if wanted:
        show_all = any(name.startswith('$') for name in wanted)
        for layer in doc.layer_ui_configs():
            # A layer stays visible when its name is contained in ANY of the
            # requested names; deciding once per layer prevents a later name
            # from switching off a layer selected by an earlier one.
            visible = show_all or any(layer['text'] in name for name in wanted)
            # action=0 switches the layer on, action=2 switches it off
            doc.set_layer_ui_config(layer['number'], action=0 if visible else 2)

    # get all pages
    text = '|'.join(page.get_text() for page in doc)

    return find_pattern(text, include, exclude, remove)

def extract_tag(file, patterns):
    """Extracts pattern list <patterns> from <file>

    Args:
        file (file object): PDF file object to be extracted (needs read() and name)
        patterns (list): rows like [name, where, label, include, exclude, remove]

    Returns:
        tuple: (tag_list, error_txt) where tag_list is
            [[pattern name1, tag1, filename1], [pattern name2, tag2, ...], ...]
            and error_txt is '' or the messages of all rows with an unknown
            <where>, separated by '; '
    """
    def _cell(value):
        # table cells may be None instead of '' (the Patterns tab guards against
        # this for Exclude), so anything that is not a string counts as empty
        return value.strip() if isinstance(value, str) else ''

    # creating a pdf reader object
    doc = fitz.open(stream=file.read(), filetype='pdf')
    tag_list = []
    errors = []
    try:
        # go through all patterns to be detected
        for pattern in patterns:
            pname = _cell(pattern[0])
            where = _cell(pattern[1]).upper()
            label = _cell(pattern[2])
            include = pattern[3]
            exclude = pattern[4]
            remove = pattern[5]
            if where == "TEXT":
                tags = get_from_text(doc, include, exclude, remove)
            elif where == "TOC":
                tags = get_from_toc(doc, include, exclude, remove)
            elif where == "BOOKMARK":
                tags = get_bookmark(doc, label, include, exclude, remove)
            elif where == "LAYER":
                tags = get_layer(doc, label, include, exclude, remove)
            elif where == "PATH":
                tags = find_pattern(file.name, include, exclude, remove)
            else:
                # Unknown location: record it and skip the row, otherwise the
                # tags of the previous row would be stored again under this name.
                errors.append(f'{where} does not exist')
                continue

            tag_list.extend([pname, tag, file.name] for tag in tags)
    finally:
        # release the document even if one of the searches raises
        doc.close()

    return tag_list, '; '.join(errors)

def file_info(file_list):
    """Summarises each PDF: file name, page count and where tags can be searched

    Args:
        file_list (list): PDF file objects (need read(), seek() and name)

    Returns:
        DataFrame: one row per file with columns "File", "Pages" and "Wheres"
            ("Wheres" lists which of 'LAYER', 'TEXT', 'BOOKMARK' the file offers)
    """
    res = {"File": [], "Pages": [], "Wheres": []}
    for file in file_list:
        doc = fitz.open(stream=file.read(), filetype='pdf')
        # rewind, so that a later reader of the same file object (the
        # extraction step) still gets the complete content
        file.seek(0)
        try:
            res['File'].append(file.name)
            res['Pages'].append(doc.page_count)
            where_file = []
            if len(doc.layer_ui_configs()) > 0:
                where_file.append('LAYER')
            if any(page.get_text() for page in doc):
                where_file.append('TEXT')
            if len(doc.get_toc()) > 0:
                where_file.append('BOOKMARK')
            res['Wheres'].append(where_file)
        finally:
            doc.close()

    return pd.DataFrame(res)

##################################### Define Streamlit interface ########################################
# Page-wide settings and title; the application is laid out as three tabs.
st.set_page_config(layout="wide") 
st.markdown('## **PDF tag Extractor**')
st.markdown('**v2.40** (June 2023 / S. Jaumain)')
#st.markdown('###### by S. Jaumain')


tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result'])

##################################### TAB 1 ########################################
# Upload of the PDF files; the result is kept in st.session_state.pdf_files
# and read again by the extraction step in the Result tab.
with tab1:
    st.subheader('Choose your PDF file(s):')
    # slot created before the uploader, filled below with the upload status
    placeholder = st.empty()
    #placeholder2 = st.empty()
    st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True)
    # check existence of PDF files
    if st.session_state.pdf_files:
        placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='✅')
        #with placeholder2.expander(':information_source: FILE INFO'):
        #    st.dataframe(file_info(st.session_state.pdf_files), use_container_width=True, hide_index=True)

    else:
        placeholder.warning('No file selected yet.', icon='📢')
    
##################################### TAB 2 ########################################
# Default pattern table shown in the editor. Column order is
# Name, Where, Labels, Include, Exclude, Remove (see the DataFrame below).
patterns = [["Tags Instrument",
             "BOOKMARK",
             "instrument",
             "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}",
             "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)",
             "",
             ]
        ]

st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name','Where','Labels','Include','Exclude','Remove'])
st.session_state.df_pattern.index.name = "Pattern #"
st.session_state.flag=False

# Markdown text displayed in the HELP expander of the Patterns tab
help_lines = """
:blue[Name] give a string with the name/type to be displayed in the output list

:blue[Where] give a list [...] of strings with following options:

- ["TEXT"] = search in plain PDF text
            
- ["BOOKMARK",<label>] = search in bookmarks with name containing <label>. if <name>="$" then all.
            
- ["LAYER", <list>] = search in layers named in <list> as a list of strings
            
- ["PATH"] = search pattern in path name.

- ["TOC"] = search pattern in table of content.
            
:blue[Include] give a regex string for the patterns to include
    
:blue[Exclude] give a regex string for the patterns to exclude. :red[BEWARE:] exclude has priority 2

:blue[Remove] a list of strings to be removed from found patterns :red[BEWARE:] remove has priority 1
"""
warn_flag = True
where_keywords = ['TEXT', 'PATH', 'BOOKMARK', 'LAYER', 'TOC']
# Per-column settings of the pattern editor
df_config = {
    'Name':st.column_config.TextColumn('Name',
                                       required=True
                                        ),
    'Where': st.column_config.TextColumn('Where',
                                          help='Indicate where to search. Can be '+', '.join(where_keywords)+'.',
                                          default='TEXT',
                                          required=True,
                                          # anchored so that only a whole keyword is accepted
                                          # (surrounding blanks are stripped later by extract_tag);
                                          # an unanchored alternation would also accept e.g. "TEXTBOOK"
                                          validate=r'^\s*(?:' + '|'.join(where_keywords) + r')\s*$'
                                          ),
    'Labels': st.column_config.TextColumn('Labels',
                                          help='Indicate the label of Bookmark or Layer to search in. For all use "$".',
                                          ),
    'Include':st.column_config.TextColumn('Include',
                                          help='For examples of REGEXs please refer to https://regex101.com/',
                                          required=True,
                                          # raw string: '\S' in a normal literal is an invalid escape sequence
                                          validate=r'\S'
                                          ),
    'Exclude':st.column_config.TextColumn('Exclude',
                                          help='For examples of REGEXs please refer to https://regex101.com/',
                                          required=False,
                                          default='',
                                        )    
}

# Patterns tab: editable pattern table plus validation of its rows.
# The outcome of the validation is stored in st.session_state.df_error.
with tab2:
    if 'df_error' not in st.session_state:
        st.session_state.df_error = False
    st.header('REGEX dictionary')
    with st.expander(':question: HELP'):
        st.markdown(help_lines)

    # slot above the table for the validation messages
    tab2_placehld = st.empty()

    st.session_state.df_pattern = st.data_editor(st.session_state.df_pattern,
                                                 column_config=df_config,
                                                 use_container_width=True,
                                                 num_rows='dynamic',
                                                 key='TT')

    # Validate on every run: the check is cheap and keeps df_error in step
    # with the table, also after rows have been deleted.
    problems = []
    for i, row in st.session_state.df_pattern.iterrows():
        # cells may hold None instead of a string, so normalise before use
        name = row['Name'] if isinstance(row['Name'], str) else ''
        where = row['Where'].strip().upper() if isinstance(row['Where'], str) else ''
        label = row['Labels'].strip() if isinstance(row['Labels'], str) else ''

        if where in ('BOOKMARK', 'LAYER') and not label:
            problems.append('"'+name+'" row: missing <LABEL> error. Required with Bookmarks or Layers.')

        try:
            re.compile(row['Include'])
        except (re.error, TypeError):
            # TypeError: the cell does not contain a string at all
            problems.append('"'+name+'" row: Include REGEX pattern not valid. Refer to HELP')

        exclude = row['Exclude']
        if not isinstance(exclude, str):
            # empty cell: store '' so that later steps always get a string
            st.session_state.df_pattern.loc[i, 'Exclude'] = ''
        else:
            try:
                re.compile(exclude)
            except re.error:
                problems.append('"'+name+'" row: Exclude REGEX pattern not valid. Refer to HELP')

    st.session_state.df_error = bool(problems)
    if problems:
        # a single placeholder shows one element only, so report all problems together
        tab2_placehld.warning('\n\n'.join(problems), icon='📢')
   
##################################### TAB 3 ########################################
# Result tab: runs the extraction on all uploaded files, shows the tags found
# and offers them as an XLSX download.
patterns = st.session_state.df_pattern.values.tolist()
st.session_state.df = pd.DataFrame({})
with tab3:
    col1, col2 = st.columns(2)
    tab3_placehld = col1.empty()
    filename = col1.text_input('XLSX output file name for extracted tags:', value='tags.xlsx')
    # drop the last extension only, so that dots inside the name are kept
    filename = filename.rsplit('.', 1)[0]
    rm_duplicates = col1.checkbox('remove duplicates?', value=True)
    one_sheet = col1.checkbox('All extraction categories on one single sheet?', value=True)
    btn = col1.button('Extract tags')
    tab3_placehld2 = col1.empty()
    pdf_files = st.session_state.pdf_files or []
    if btn and not pdf_files:
        tab3_placehld.warning('No files selected!', icon='⛔')
    elif btn and st.session_state.get('df_error', False):
        # the Patterns tab flagged the table as invalid; extracting with it
        # would fail inside the regex functions
        tab3_placehld.warning('Pattern table contains errors. Correct them in the Patterns tab first.', icon='⛔')
    elif btn:
        tag_list = []
        error_list = []
        progress_text = 'Extraction on-going'
        progress_bar = col2.progress(0, text=progress_text)
        for i, file in enumerate(pdf_files):
            tag_ls, err_txt = extract_tag(file, patterns)
            tag_list.extend(tag_ls)
            # err_txt is a string: append it as one message (extend() would
            # add it character by character)
            if err_txt:
                error_list.append(f'{file.name}: {err_txt}')
            progress_bar.progress((i+1)/len(pdf_files), text=progress_text)
        progress_bar.progress(1.0, text="Completed")
        if error_list:
            tab3_placehld2.warning('\n\n'.join(error_list), icon='📢')
        st.session_state.df = pd.DataFrame(tag_list, columns=['Tag type','Tag','Origin file'])
        if rm_duplicates:
            st.session_state.df = st.session_state.df.drop_duplicates(subset=['Tag','Origin file'])
        col2.success(f'Tag(s) found: {st.session_state.df.shape[0]}')
        col2.dataframe(st.session_state.df, use_container_width=True, hide_index=True)
        if st.session_state.df.shape[0] > 0:
            buffer = BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as excel:
                if one_sheet:
                    st.session_state.df.to_excel(excel, sheet_name='tags', index=False)
                else:
                    for category in pd.unique(st.session_state.df['Tag type']):
                        # Excel sheet names are limited to 31 characters and
                        # must not contain [ ] : * ? / \
                        sheet = re.sub(r'[\[\]:*?/\\]', '_', str(category))[:31] or 'tags'
                        st.session_state.df[st.session_state.df['Tag type'] == category].to_excel(excel, sheet_name=sheet, index=False)
            col2.download_button('📥 Download as XLSX',
                                 data=buffer.getvalue(),
                                 file_name=filename + '.xlsx',
                                 # MIME type of .xlsx workbooks ('application/vnd.ms-excel' is the legacy .xls type)
                                 mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
        else:
            col1.warning('File empty! Not written.', icon='❌')