Spaces:
Configuration error
Configuration error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!python
|
2 |
+
# # Engineering PDF tag extractor
|
3 |
+
# by Serge Jaumain / SPIE Oil & Gas Services
|
4 |
+
#
|
5 |
+
# 31/05/2023
|
6 |
+
|
7 |
+
# importing required modules
|
8 |
+
import re
|
9 |
+
import pandas as pd
|
10 |
+
import fitz
|
11 |
+
import streamlit as st
|
12 |
+
from io import BytesIO
|
13 |
+
|
14 |
+
|
15 |
+
def find_pattern(text, include, exclude, remove):
    """Find pattern <include> in <text>, excluding matches of <exclude>.

    The <remove> pattern is stripped from the text before searching.

    Args:
        text (str): text to be scanned.
        include (str): regex used to extract patterns from the text.
        exclude (str): regex; any match containing it is dropped.
        remove (str): regex removed from the text before matching.

    Returns:
        list: matching strings, filtered.
    """
    # Normalise None inputs so the regex calls below never fail
    # (identity check: PEP 8 mandates `is None`, not `== None`).
    if remove is None:
        remove = ''
    if include is None:
        include = ''
    found = re.findall(include, re.sub(remove, '', text))
    if not exclude:
        return found
    # Keep only matches that do not themselves contain the exclude pattern.
    return [el for el in found if not re.findall(exclude, el)]
|
41 |
+
|
42 |
+
def get_from_text(doc, include, exclude, remove):
    """Extract tags from the visible text layer of a PDF.

    Args:
        doc (fitz document): PDF document to scan.
        include (str): regex of tags to include.
        exclude (str): regex of tags to exclude.
        remove (str): regex of substrings removed before matching.

    Returns:
        list: raw list of tags found.
    """
    # Switch every optional-content layer on before reading the text
    # (action=0 turns a layer on).
    for layer in doc.layer_ui_configs():
        doc.set_layer_ui_config(layer['number'], action=0)

    # Concatenate all pages; '|' keeps page boundaries from fusing tags.
    full_text = '|'.join(page.get_text() for page in doc)

    return find_pattern(full_text, include, exclude, remove)
|
62 |
+
|
63 |
+
def get_from_toc(doc, include, exclude, remove):
    """Extract tags from the table of contents of a PDF.

    Args:
        doc (fitz document): PDF document to scan.
        include (str): regex of tags to include.
        exclude (str): regex of tags to exclude.
        remove (str): regex of substrings removed before matching.

    Returns:
        list: raw list of tags found.
    """
    # doc.get_toc() returns [level, title, page] entries, not a string;
    # passing the raw list into find_pattern() made re.sub() raise a
    # TypeError. Join the titles so the search receives plain text.
    text = '|'.join(item[1] for item in doc.get_toc())

    return find_pattern(text, include, exclude, remove)
|
78 |
+
|
79 |
+
def get_bookmark(doc, bm_text, include, exclude, remove):
    """Extract tags from PDF bookmarks selected by *bm_text*.

    Args:
        doc (fitz document): PDF document to scan.
        bm_text (str): substring selecting the top-level bookmarks to
            search (not case sensitive); '$' means search every bookmark.
        include (str): regex of tags to include.
        exclude (str): regex of tags to exclude.
        remove (str): regex of substrings removed before matching.

    Returns:
        list: list like [tag1, tag2, ...]
    """
    tags = []
    in_selected_branch = False
    for entry in doc.get_toc():
        level, title = entry[0], entry[1]
        if bm_text == '$':
            # Wildcard: scan every bookmark title.
            tags.extend(find_pattern(title, include, exclude, remove))
        elif level == 1:
            # A top-level entry decides whether its children are scanned.
            in_selected_branch = bm_text.upper() in title.upper()
        elif in_selected_branch:
            tags.extend(find_pattern(title, include, exclude, remove))
    return tags
|
107 |
+
|
108 |
+
def get_layer(doc, layer2search, include, exclude, remove):
    """Extract tags from selected optional-content layers of a PDF.

    Args:
        doc (fitz document): PDF document to scan.
        layer2search (iterable of str): names of the layers to extract;
            an entry starting with "$" switches ALL layers on.
        include (str): regex of tags to include.
        exclude (str): regex of tags to exclude.
        remove (str): regex of substrings removed before matching.

    Returns:
        list: raw list of tags found.
    """
    doc_layers = doc.layer_ui_configs()
    # Switch on all layers if "$" is requested, else switch on only the
    # wanted layers (action=0 -> layer on, action=2 -> layer off).
    for layersearched in layer2search:
        # startswith() instead of [0]: avoids IndexError on empty entries.
        if layersearched.strip().startswith("$"):
            for layer in doc_layers:
                doc.set_layer_ui_config(layer['number'], action=0)
            break
        else:
            for layer in doc_layers:
                if layer['text'] in layersearched.strip():
                    doc.set_layer_ui_config(layer['number'], action=0)
                else:
                    doc.set_layer_ui_config(layer['number'], action=2)

    # Get the text of all pages with the chosen layer visibility applied.
    text = '|'.join(page.get_text() for page in doc)

    return find_pattern(text, include, exclude, remove)
|
140 |
+
|
141 |
+
def extract_tag(file, patterns):
    """Extract every pattern of <patterns> from PDF <file>.

    Args:
        file (file object): PDF file object to be extracted.
        patterns (list): pattern rows, each
            [name, where, labels, include, exclude, remove].

    Returns:
        tuple: (tag_list, error_txt) where tag_list is
            [[pattern name1, tag1, filename1], [pattern name2, tag2, ...]]
            and error_txt is '' or a message naming an unknown 'where'
            keyword.
    """
    # creating a pdf reader object
    doc = fitz.open(stream=file.read(), filetype='pdf')
    tag_list = []
    # error_txt is set once, outside the loop, so an error reported for an
    # early pattern is no longer wiped out by later, valid patterns.
    error_txt = ''
    # go through all patterns to be detected
    for pattern in patterns:
        pname = pattern[0].strip()
        where = pattern[1].strip().upper()
        label = pattern[2].strip()
        include = pattern[3]
        exclude = pattern[4]
        remove = pattern[5]
        if where == "TEXT":
            tags = get_from_text(doc, include, exclude, remove)
        elif where == "TOC":
            tags = get_from_toc(doc, include, exclude, remove)
        elif where == "BOOKMARK":
            tags = get_bookmark(doc, label, include, exclude, remove)
        elif where == "LAYER":
            tags = get_layer(doc, label, include, exclude, remove)
        elif where == "PATH":
            tags = find_pattern(file.name, include, exclude, remove)
        else:
            # Unknown keyword: record the error and skip this pattern.
            # tags must be reset here, otherwise it would be unbound on
            # the first iteration (NameError) or stale on later ones.
            tags = []
            error_txt = where + ' does not exist'

        for tag in tags:
            tag_list.append([pname, tag, file.name])

    return tag_list, error_txt
|
186 |
+
|
187 |
+
def file_info(file_list):
    """Summarise uploaded PDFs: name, page count and searchable areas.

    Args:
        file_list (list): uploaded PDF file objects.

    Returns:
        pandas.DataFrame: columns 'File', 'Pages' and 'Wheres' (the
            'Where' keywords that make sense for each file).
    """
    res = {"File": [], "Pages": [], "Wheres": []}
    # Iterate the uploads directly: the previous `os.dup(file_list)` call
    # was invalid (`os` was never imported, and os.dup duplicates an OS
    # file descriptor, not a Python list).
    for file in file_list:
        doc = fitz.open(stream=file.read(), filetype='pdf')
        file.seek(0)  # rewind so the stream can be read again later
        res['File'].append(file.name)
        res['Pages'].append(doc.page_count)
        where_file = []
        if len(doc.layer_ui_configs()) > 0:
            where_file.append('LAYER')
        if ''.join(page.get_text() for page in doc) != '':
            where_file.append('TEXT')
        if len(doc.get_toc()) > 0:
            where_file.append('BOOKMARK')
        res['Wheres'].append(where_file)
        doc.close()

    return pd.DataFrame(res)
|
205 |
+
|
206 |
+
##################################### Define Streamlit interface ########################################
st.set_page_config(layout="wide")
st.markdown('## **PDF tag Extractor**')
st.markdown('**v2.40** (June 2023 / S. Jaumain)')


tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result'])

##################################### TAB 1 ########################################
with tab1:
    st.subheader('Choose your PDF file(s):')
    # Placeholder so the status message can be swapped in place.
    placeholder = st.empty()
    st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True)
    # check existence of PDF files
    if st.session_state.pdf_files:
        # NOTE(review): the original icon characters were mojibake in the
        # scraped source; plausible emoji restored — confirm against the app.
        placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='✅')
    else:
        placeholder.warning('No file selected yet.', icon='🟢')
|
229 |
+
|
230 |
+
##################################### TAB 2 ########################################
# Default pattern row shown in the editable table:
# [Name, Where, Labels, Include, Exclude, Remove]
patterns = [["Tags Instrument",
             "BOOKMARK",
             "instrument",
             "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}",
             "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)",
             "",
             ]
            ]

st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name', 'Where', 'Labels', 'Include', 'Exclude', 'Remove'])
st.session_state.df_pattern.index.name = "Pattern #"
st.session_state.flag = False

help_lines = """
:blue[Name] give a string with the name/type to be displayed in the output list

:blue[Where] give a list [...] of strings with following options:

- ["TEXT"] = search in plain PDF text

- ["BOOKMARK",<label>] = search in bookmarks with name containing <label>. if <name>="$" then all.

- ["LAYER", <list>] = search in layers named in <list> as a list of strings

- ["PATH"] = search pattern in path name.

- ["TOC"] = search pattern in table of content.

:blue[Include] give a regex string for the patterns to include

:blue[Exclude] give a regex string for the patterns to exclude. :red[BEWARE:] exclude has priority 2

:blue[Remove] a list of strings to be removed from found patterns :red[BEWARE:] remove has priority 1
"""
warn_flag = True
where_keywords = ['TEXT', 'PATH', 'BOOKMARK', 'LAYER', 'TOC']
# Column configuration for the st.data_editor pattern table.
df_config = {
    'Name': st.column_config.TextColumn('Name',
                                        required=True
                                        ),
    'Where': st.column_config.TextColumn('Where',
                                         help='Indicate where to search. Can be ' + ', '.join(where_keywords) + '.',
                                         default='TEXT',
                                         required=True,
                                         validate='|'.join(where_keywords)
                                         ),
    'Labels': st.column_config.TextColumn('Labels',
                                          help='Indicate the label of Bookmark or Layer to search in. For all use "$".',
                                          ),
    'Include': st.column_config.TextColumn('Include',
                                           help='For examples of REGEXs please refer to https://regex101.com/',
                                           required=True,
                                           validate=r'\S'  # raw string: \S is a regex class, not an escape
                                           ),
    'Exclude': st.column_config.TextColumn('Exclude',
                                           help='For examples of REGEXs please refer to https://regex101.com/',
                                           required=False,
                                           default='',
                                           )
}

with tab2:
    if 'df_error' not in st.session_state:
        st.session_state.df_error = False
    st.header('REGEX dictionary')
    with st.expander(':question: HELP'):
        st.markdown(help_lines)

    tab2_placehld = st.empty()

    st.session_state.df_pattern = st.data_editor(st.session_state.df_pattern,
                                                 column_config=df_config,
                                                 use_container_width=True,
                                                 num_rows='dynamic',
                                                 key='TT')

    # Re-validate only when the user actually edited or added rows.
    # Truthiness works for both containers; the original compared
    # added_rows (a list) against {} which is always unequal.
    if st.session_state.TT['edited_rows'] or st.session_state.TT['added_rows']:
        st.session_state.df_error = False
        for i, row in st.session_state.df_pattern.iterrows():

            # BOOKMARK and LAYER searches need a label to select from.
            if row['Where'] in ['BOOKMARK', 'LAYER'] and row['Labels'] == '':
                st.session_state.df_error = True
                tab2_placehld.warning('"' + row['Name'] + '" row: missing <LABEL> error. Required with Bookmarks or Layers.', icon='🟢')

            # Both regex columns must compile before extraction is allowed.
            try:
                re.compile(row['Include'])
            except re.error:
                tab2_placehld.warning('"' + row['Name'] + '" row: Include REGEX pattern not valid. Refer to HELP', icon='🟢')
                st.session_state.df_error = True

            if row['Exclude'] is None:
                st.session_state.df_pattern.loc[i, 'Exclude'] = ''
            else:
                try:
                    re.compile(row['Exclude'])
                except re.error:
                    tab2_placehld.warning('"' + row['Name'] + '" row: Exclude REGEX pattern not valid. Refer to HELP', icon='🟢')
                    st.session_state.df_error = True
332 |
+
|
333 |
+
##################################### TAB 3 ########################################
patterns = st.session_state.df_pattern.values.tolist()
st.session_state.df = pd.DataFrame({})
with tab3:
    col1, col2 = st.columns(2)
    tab3_placehld = col1.empty()
    filename = col1.text_input('XLSX output file name for extracted tags:', value='tags.xlsx')
    filename = filename.split('.')[0]  # keep the stem; '.xlsx' is re-added at download
    rm_duplicates = col1.checkbox('remove duplicates?', value=True)
    one_sheet = col1.checkbox('All extraction categories on one single sheet?', value=True)
    btn = col1.button('Extract tags')
    tab3_placehld2 = col1.empty()
    if btn and len(st.session_state.pdf_files) == 0:
        tab3_placehld.warning('No files selected!', icon='❗')
    if btn and len(st.session_state.pdf_files) > 0:
        tag_list = []
        error_list = []
        progress_text = 'Extraction on-going'
        progress_bar = col2.progress(0, text=progress_text)
        for i, file in enumerate(st.session_state.pdf_files):
            tag_ls, err_txt = extract_tag(file, patterns)
            tag_list.extend(tag_ls)
            # append, not extend: err_txt is a single string and extend()
            # would add it character by character; skip empty messages.
            if err_txt:
                error_list.append(err_txt)
            progress_bar.progress((i + 1) / len(st.session_state.pdf_files), text=progress_text)
        progress_bar.progress(1.0, text="Completed")
        st.session_state.df = pd.DataFrame(tag_list, columns=['Tag type', 'Tag', 'Origin file'])
        if rm_duplicates:
            st.session_state.df = st.session_state.df.drop_duplicates(subset=['Tag', 'Origin file'])
        col2.success(f'Tag(s) found: {st.session_state.df.shape[0]}')
        col2.dataframe(st.session_state.df, use_container_width=True, hide_index=True)
        if st.session_state.df.shape[0] > 0:
            buffer = BytesIO()
            # The context manager closes/finalises the workbook, so no
            # explicit excel.close() is needed.
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as excel:
                if one_sheet:
                    st.session_state.df.to_excel(excel, sheet_name='tags', index=False)
                else:
                    # One worksheet per extraction category.
                    for category in pd.unique(st.session_state.df['Tag type']):
                        st.session_state.df[st.session_state.df['Tag type'] == category].to_excel(excel, sheet_name=category, index=False)
            col2.download_button('📥 Download as XLSX', data=buffer, file_name=filename + '.xlsx', mime='application/vnd.ms-excel')
        else:
            col1.warning('File empty! Not written.', icon='❗')
|