# pdf-extractor / app.py
# jase64's picture
# Create app.py
# aa0784d
#!python
# # Engineering PDF tag extractor
# by Serge Jaumain / SPIE Oil & Gas Services
#
# 31/05/2023
# importing required modules
import re
import pandas as pd
import fitz
import streamlit as st
from io import BytesIO
def find_pattern(text, include, exclude, remove):
    """Find all occurrences of <include> in <text>, dropping candidates that match <exclude>.

    Every occurrence of <remove> is stripped from the text before matching.

    Args:
        text (str): text to be scanned.
        include (str): regex of the patterns to extract (None is treated as '').
        exclude (str): regex; any candidate containing a match of this pattern
            is discarded (empty/None disables the filter).
        remove (str): regex of substrings removed from the text before the
            search (None is treated as '').

    Returns:
        list[str]: matches of <include>, in order of appearance.
    """
    # Normalize None to the neutral pattern so the re calls below cannot fail.
    if remove is None:
        remove = ''
    if include is None:
        include = ''
    candidates = re.findall(include, re.sub(remove, '', text))
    if not exclude:
        return candidates
    # Keep only candidates that contain no occurrence of the exclude pattern.
    return [cand for cand in candidates if re.search(exclude, cand) is None]
def get_from_text(doc, include, exclude, remove):
    """Extract tag patterns from the visible text of every page of the PDF.

    All optional-content layers are switched on first so no text is hidden
    from the extraction.

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found
    """
    # Switch on all layers (action=0) so every page's text is visible.
    for layer in doc.layer_ui_configs():
        doc.set_layer_ui_config(layer['number'], action=0)
    # Concatenate all page texts; '|' keeps tags on page boundaries apart.
    pages_text = '|'.join(page.get_text() for page in doc)
    return find_pattern(pages_text, include, exclude, remove)
def get_from_toc(doc, include, exclude, remove):
    """Extract tag patterns from the PDF table of contents.

    Args:
        doc (fitz document): actual pdf document to extract
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found in the TOC titles
    """
    # BUG FIX: doc.get_toc() returns a list of [level, title, page] entries,
    # not a string -- passing it straight to find_pattern() raised a TypeError
    # inside re.sub().  Join the entry titles into one searchable string
    # instead ('|' keeps adjacent titles from merging into one tag).
    text = '|'.join(str(item[1]) for item in doc.get_toc())
    return find_pattern(text, include, exclude, remove)
def get_bookmark(doc, bm_text, include, exclude, remove):
    """Extract tag patterns from the PDF bookmarks (outline).

    Args:
        doc (fitz document): actual pdf document to extract
        bm_text (string): contains a string for the selection of the bookmarks to search (not case sensitive)
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: list like [tag1, tag2, ...]
    """
    tags = []
    search_all = (bm_text == '$')
    in_selected_section = False
    for level, title, *_ in doc.get_toc():
        if search_all:
            # '$' selector: scan every bookmark title.
            tags.extend(find_pattern(title, include, exclude, remove))
            continue
        if level == 1:
            # A top-level entry opens (or closes) a section depending on
            # whether its title contains the requested text.
            in_selected_section = bm_text.upper() in title.upper()
        elif in_selected_section:
            # Deeper entries are scanned only inside a selected section.
            tags.extend(find_pattern(title, include, exclude, remove))
    return tags
def get_layer(doc, layer2search, include, exclude, remove):
    """Extract tag patterns from the text of selected PDF layers.

    Args:
        doc (fitz document): actual pdf document to extract
        layer2search (iterable of str): names of the layers to be extracted;
            if any entry starts with '$' every layer is switched on instead
        include (string): contains the regex string of tags to include
        exclude (string): contains the regex string of tags to exclude
        remove (string): contains a list of string patterns to remove at the end

    Returns:
        list: raw list of tags found
    """
    doc_layers = doc.layer_ui_configs()
    # Switch on all layers if "$" is found somewhere, otherwise switch on
    # the wanted layers (action=0) and switch off everything else (action=2).
    for wanted in layer2search:
        # BUG FIX: startswith() instead of strip()[0] avoids an IndexError
        # when an entry is empty/whitespace-only.
        if wanted.strip().startswith("$"):
            for layer in doc_layers:
                doc.set_layer_ui_config(layer['number'], action=0)
            break
        for layer in doc_layers:
            # NOTE(review): this tests that the layer name is a substring of
            # the requested entry (not the other way round) -- confirm the
            # intended direction.
            if layer['text'] in wanted.strip():
                doc.set_layer_ui_config(layer['number'], action=0)
            else:
                doc.set_layer_ui_config(layer['number'], action=2)
    # Collect the text of all (now visible) pages.
    text = '|'.join(page.get_text() for page in doc)
    return find_pattern(text, include, exclude, remove)
def extract_tag(file, patterns):
    """Extract every pattern of <patterns> from the PDF <file>.

    Args:
        file (file object): PDF file object to be extracted
        patterns (list): rows [name, where, label, include, exclude, remove]

    Returns:
        tuple: (tag_list, error_txt) where tag_list is
            [[pattern name, tag, filename], ...] and error_txt describes the
            last unknown 'Where' keyword ('' if all rows were valid).
    """
    # Create the pdf reader object from the uploaded stream.
    doc = fitz.open(stream=file.read(), filetype='pdf')
    tag_list = []
    # BUG FIX: error_txt used to be reset on every iteration, so an invalid
    # row was silently forgotten unless it happened to be the last one.
    error_txt = ''
    # Dispatch each pattern row to the matching extraction source.
    for pattern in patterns:
        pname = pattern[0].strip()
        where = pattern[1].strip().upper()
        label = pattern[2].strip()
        include = pattern[3]
        exclude = pattern[4]
        remove = pattern[5]
        # BUG FIX: 'tags' was unbound when 'where' was unknown -> NameError.
        tags = []
        if where == "TEXT":
            tags = get_from_text(doc, include, exclude, remove)
        elif where == "TOC":
            tags = get_from_toc(doc, include, exclude, remove)
        elif where == "BOOKMARK":
            tags = get_bookmark(doc, label, include, exclude, remove)
        elif where == "LAYER":
            tags = get_layer(doc, label, include, exclude, remove)
        elif where == "PATH":
            tags = find_pattern(file.name, include, exclude, remove)
        else:
            # BUG FIX: missing space produced e.g. 'FOOdoes not exist'.
            error_txt = where + ' does not exist'
        for tag in tags:
            tag_list.append([pname, tag, file.name])
    return tag_list, error_txt
def file_info(file_list):
    """Build a summary table (name, page count, searchable sources) for PDFs.

    Args:
        file_list (list): uploaded PDF file objects.

    Returns:
        pandas.DataFrame: columns 'File' (name), 'Pages' (page count) and
            'Wheres' (which of LAYER/TEXT/BOOKMARK the file can be searched in).
    """
    res = {"File": [], "Pages": [], "Wheres": []}
    # BUG FIX: the original called os.dup(file_list) -- 'os' was never
    # imported, and os.dup() duplicates a file descriptor, not a list.
    # Iterate the uploaded files directly.
    for file in file_list:
        doc = fitz.open(stream=file.read(), filetype='pdf')
        # Rewind the upload stream so a later extraction can re-read it.
        file.seek(0)
        res['File'].append(file.name)
        res['Pages'].append(doc.page_count)
        where_file = []
        if len(doc.layer_ui_configs()) > 0:
            where_file.append('LAYER')
        if ''.join(page.get_text() for page in doc) != '':
            where_file.append('TEXT')
        if len(doc.get_toc()) > 0:
            where_file.append('BOOKMARK')
        res['Wheres'].append(where_file)
        doc.close()
    return pd.DataFrame(res)
##################################### Define Streamlit interface ########################################
# Page chrome and the three workflow tabs: upload -> patterns -> extraction.
st.set_page_config(layout="wide")
st.markdown('## **PDF tag Extractor**')
st.markdown('**v2.40** (June 2023 / S. Jaumain)')
#st.markdown('###### by S. Jaumain')
tab1, tab2, tab3 = st.tabs(['File Selection', 'Patterns', 'Result'])
##################################### TAB 1 ########################################
with tab1:
    st.subheader('Choose your PDF file(s):')
    placeholder = st.empty()
    #placeholder2 = st.empty()
    # Uploaded files live in session state so the other tabs can read them.
    st.session_state.pdf_files = st.file_uploader("Choose the PDFs to upload for extraction", type=['pdf'], accept_multiple_files=True)
    # check existence of PDF files
    if st.session_state.pdf_files:
        placeholder.success(f'{len(st.session_state.pdf_files)} PDF files uploaded. Proceed to next step', icon='βœ…')
        #with placeholder2.expander(':information_source: FILE INFO'):
        #    st.dataframe(file_info(st.session_state.pdf_files), use_container_width=True, hide_index=True)
    else:
        placeholder.warning('No file selected yet.', icon='πŸ“’')
##################################### TAB 2 ########################################
# Default pattern table shown in the editor: one example row extracting
# instrument tags from bookmarks.
patterns = [["Tags Instrument",
             "BOOKMARK",
             "instrument",
             "[A-Z]{5}-[A-Z]{2,4}-[0-9]{6}",
             "(PIC|[A-Z]{2,3}V|TAL|PAL|FAL|TAH|PAH|FAH|TAHH|PAHH|FAHH|TALL|PALL|FALL)",
             "",
             ]
            ]
st.session_state.df_pattern = pd.DataFrame(patterns, columns=['Name','Where','Labels','Include','Exclude','Remove'])
st.session_state.df_pattern.index.name = "Pattern #"
st.session_state.flag=False
# Markdown text shown in the HELP expander of tab 2.
help_lines = """
:blue[Name] give a string with the name/type to be displayed in the output list
:blue[Where] give a list [...] of strings with following options:
- ["TEXT"] = search in plain PDF text
- ["BOOKMARK",<label>] = search in bookmarks with name containing <label>. if <name>="$" then all.
- ["LAYER", <list>] = search in layers named in <list> as a list of strings
- ["PATH"] = search pattern in path name.
- ["TOC"] = search pattern in table of content.
:blue[Include] give a regex string for the patterns to include
:blue[Exclude] give a regex string for the patterns to exclude. :red[BEWARE:] exclude has priority 2
:blue[Remove] a list of strings to be removed from found patterns :red[BEWARE:] remove has priority 1
"""
warn_flag = True
# Allowed values for the 'Where' column of the pattern editor.
where_keywords = ['TEXT', 'PATH', 'BOOKMARK', 'LAYER', 'TOC']
# Column configuration for st.data_editor: inline help text plus per-column
# validation regexes (the 'Where' column must match one of the keywords).
df_config = {
    'Name':st.column_config.TextColumn('Name',
                                       required=True
                                       ),
    'Where': st.column_config.TextColumn('Where',
                                         help='Indicate where to search. Can be '+', '.join(where_keywords)+'.',
                                         default='TEXT',
                                         required=True,
                                         validate='|'.join(where_keywords)
                                         ),
    'Labels': st.column_config.TextColumn('Labels',
                                          help='Indicate the label of Bookmark or Layer to search in. For all use "$".',
                                          ),
    'Include':st.column_config.TextColumn('Include',
                                          help='For examples of REGEXs please refer to https://regex101.com/',
                                          required=True,
                                          validate='\S'
                                          ),
    'Exclude':st.column_config.TextColumn('Exclude',
                                          help='For examples of REGEXs please refer to https://regex101.com/',
                                          required=False,
                                          default='',
                                          #validate='\S'
                                          )
    }
with tab2:
    # df_error flags an invalid pattern table for the rest of the app.
    if 'df_error' not in st.session_state:
        st.session_state.df_error = False
    st.header('REGEX dictionary')
    with st.expander(':question: HELP'):
        st.markdown(help_lines)
    tab2_placehld = st.empty()
    # Editable pattern table; key 'TT' exposes the editor state in session_state.
    st.session_state.df_pattern = st.data_editor(st.session_state.df_pattern,
                                                 column_config=df_config,
                                                 use_container_width=True,
                                                 num_rows='dynamic',
                                                 #disabled=['Check'],
                                                 key='TT')
    # Re-validate the whole table whenever rows were edited or added.
    # NOTE(review): 'added_rows' is a list in the editor state, so comparing
    # it to {} is always True -- confirm the intended trigger condition.
    if st.session_state.TT['edited_rows'] != {} or st.session_state.TT['added_rows'] != {}:
        st.session_state.df_error = False
        for i, row in st.session_state.df_pattern.iterrows():
            # BOOKMARK/LAYER rows need a label saying where to search.
            if row['Where'] in ['BOOKMARK', 'LAYER'] and row['Labels']=='':
                st.session_state.df_error = True
                tab2_placehld.warning('"'+row['Name']+'" row: missing <LABEL> error. Required with Bookmarks or Layers.', icon='πŸ“’')
            # The Include regex must compile.
            try:
                re.compile(row['Include'])
            except re.error:
                tab2_placehld.warning('"'+row['Name']+'" row: Include REGEX pattern not valid. Refer to HELP', icon='πŸ“’')
                st.session_state.df_error = True
            # Empty Exclude is normalized to ''; otherwise it must compile.
            if row['Exclude']==None:
                st.session_state.df_pattern.loc[i,'Exclude']=''
            else:
                try:
                    re.compile(row['Exclude'])
                except re.error:
                    tab2_placehld.warning('"'+row['Name']+'" row: Exclude REGEX pattern not valid. Refer to HELP', icon='πŸ“’')
                    st.session_state.df_error = True
##################################### TAB 3 ########################################
# Snapshot the (possibly edited) pattern table as plain rows for extract_tag().
patterns = st.session_state.df_pattern.values.tolist()
st.session_state.df = pd.DataFrame({})
with tab3:
    col1, col2 = st.columns(2)
    tab3_placehld = col1.empty()
    filename = col1.text_input('XLSX output file name for extracted tags:', value='tags.xlsx')
    # Keep only the base name; '.xlsx' is re-appended on download.
    filename = filename.split('.')[0]
    rm_duplicates = col1.checkbox('remove duplicates?', value=True)
    one_sheet = col1.checkbox('All extraction categories on one single sheet?', value=True)
    btn = col1.button('Extract tags')
    tab3_placehld2 = col1.empty()
    if btn and len(st.session_state.pdf_files) == 0:
        tab3_placehld.warning('No files selected!', icon='β›”')
    if btn and len(st.session_state.pdf_files) > 0 :
        tag_list = []
        error_list = []
        progress_text = 'Extraction on-going'
        progress_bar = col2.progress(0, text=progress_text)
        # Run every pattern on every uploaded file, updating the progress bar.
        for i, file in enumerate(st.session_state.pdf_files):
            tag_ls, err_txt = extract_tag(file, patterns)
            tag_list.extend(tag_ls)
            # NOTE(review): extend() on a string adds it char by char, and
            # error_list is never read afterwards -- probably meant append().
            error_list.extend(err_txt)
            progress_bar.progress((i+1)/len(st.session_state.pdf_files), text=progress_text)
        progress_bar.progress((i+1)/len(st.session_state.pdf_files), text="Completed")
        st.session_state.df = pd.DataFrame(tag_list, columns=['Tag type','Tag','Origin file'])
        if rm_duplicates:
            # A tag is a duplicate when tag AND origin file both match.
            st.session_state.df = st.session_state.df.drop_duplicates(subset=['Tag','Origin file'])
        col2.success(f'Tag(s) found: {st.session_state.df.shape[0]}')
        col2.dataframe(st.session_state.df, use_container_width=True, hide_index=True)
        if st.session_state.df.shape[0] > 0:
            # Write the result to an in-memory XLSX workbook for download.
            buffer = BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as excel:
                if one_sheet:
                    st.session_state.df.to_excel(excel, sheet_name='tags', index=False)
                else:
                    # One worksheet per tag category.
                    for category in pd.unique(st.session_state.df['Tag type']):
                        st.session_state.df[st.session_state.df['Tag type'] == category].to_excel(excel, sheet_name=category, index=False)
            #excel.close()
            col2.download_button('πŸ“₯ Download as XLSX', data=buffer, file_name= filename + '.xlsx', mime='application/vnd.ms-excel')
        else:
            col1.warning(f'File empty! Not written.', icon='❌')